Skip to content

Commit

Permalink
chore
Browse files Browse the repository at this point in the history
  • Loading branch information
zobinHuang committed Dec 6, 2024
1 parent e2e1269 commit 413c7e1
Show file tree
Hide file tree
Showing 18 changed files with 167 additions and 67 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,6 @@ assets
dockerfiles/assets
third_party/libclang-static-build
third_party/libclang-static-build.tar.gz

# ignore experiment results
osdi_data
9 changes: 9 additions & 0 deletions autogen/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,15 @@ if conf_runtime_enable_trace != 0 and conf_runtime_enable_trace != 1
)
endif

# whether to trace the memory statistics; read from the environment variable
# POS_BUILD_CONF_RuntimeEnableMemoryTrace and required to be either 0 or 1
conf_runtime_enable_memory_trace = run_command('sh', '-c', 'echo $POS_BUILD_CONF_RuntimeEnableMemoryTrace').stdout().strip().to_int()
assert(
  conf_runtime_enable_memory_trace == 0 or conf_runtime_enable_memory_trace == 1,
  'conf_runtime_enable_memory_trace get invalid value: ' + conf_runtime_enable_memory_trace.to_string()
)

# log path of PhOS daemon
conf_runtime_default_daemon_log_path = run_command('sh', '-c', 'echo $POS_BUILD_CONF_RuntimeDefaultDaemonLogPath').stdout().strip()
if conf_runtime_default_daemon_log_path == ''
Expand Down
1 change: 0 additions & 1 deletion examples/ppo/client_exist.txt

This file was deleted.

9 changes: 9 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,15 @@ if conf_runtime_enable_trace != 0 and conf_runtime_enable_trace != 1
)
endif

# whether to trace the memory statistics; read from the environment variable
# POS_BUILD_CONF_RuntimeEnableMemoryTrace and required to be either 0 or 1
conf_runtime_enable_memory_trace = run_command('sh', '-c', 'echo $POS_BUILD_CONF_RuntimeEnableMemoryTrace').stdout().strip().to_int()
assert(
  conf_runtime_enable_memory_trace == 0 or conf_runtime_enable_memory_trace == 1,
  'conf_runtime_enable_memory_trace get invalid value: ' + conf_runtime_enable_memory_trace.to_string()
)

# log path of PhOS daemon
conf_runtime_default_daemon_log_path = run_command('sh', '-c', 'echo $POS_BUILD_CONF_RuntimeDefaultDaemonLogPath').stdout().strip()
if conf_runtime_default_daemon_log_path == ''
Expand Down
9 changes: 9 additions & 0 deletions pos/cli/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,15 @@ if conf_runtime_enable_trace != 0 and conf_runtime_enable_trace != 1
)
endif

# whether to trace the memory statistics; read from the environment variable
# POS_BUILD_CONF_RuntimeEnableMemoryTrace and required to be either 0 or 1
conf_runtime_enable_memory_trace = run_command('sh', '-c', 'echo $POS_BUILD_CONF_RuntimeEnableMemoryTrace').stdout().strip().to_int()
assert(
  conf_runtime_enable_memory_trace == 0 or conf_runtime_enable_memory_trace == 1,
  'conf_runtime_enable_memory_trace get invalid value: ' + conf_runtime_enable_memory_trace.to_string()
)

# log path of PhOS daemon
conf_runtime_default_daemon_log_path = run_command('sh', '-c', 'echo $POS_BUILD_CONF_RuntimeDefaultDaemonLogPath').stdout().strip()
if conf_runtime_default_daemon_log_path == ''
Expand Down
14 changes: 14 additions & 0 deletions pos/cuda_impl/handle/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,20 @@ class POSHandleManager_CUDA_Memory : public POSHandleManager<POSHandle_CUDA_Memo
pos_retval_t try_restore_from_pool(POSHandle_CUDA_Memory* handle) override;


/* =========================== metric system ============================= */
public:
// everything below exists only when runtime tracing is compiled in
#if POS_CONF_RUNTIME_EnableTrace
/*!
 *  \brief  identifiers of the tickers maintained by the CUDA memory handle manager
 */
enum metrics_ticker_type_t : uint8_t {
// NOTE(review): presumably a reserved base value for the ticker list — confirm
__TICKER_BASE__ = 0,
// started before / ended after the cudaMemcpyAsync issued in
// POSHandle_CUDA_Memory::__reload_state
RESTORE_reload_state
};
// tickers indexed by metrics_ticker_type_t
POSMetrics_TickerList<metrics_ticker_type_t> metric_tickers;

/*!
 *  \brief  log the tickers collected by this handle manager
 */
void print_metrics() override;
#endif
/* =========================== metric system ============================= */


private:
/*!
* \brief restore the extra fields of handle with specific type
Expand Down
21 changes: 21 additions & 0 deletions pos/cuda_impl/handle/module.h
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,27 @@ class POSHandleManager_CUDA_Module : public POSHandleManager<POSHandle_CUDA_Modu
*/
pos_retval_t try_restore_from_pool(POSHandle_CUDA_Module* handle) override;


/* =========================== metric system ============================= */
public:
// everything below exists only when runtime tracing is compiled in
#if POS_CONF_RUNTIME_EnableTrace
/*!
 *  \brief  identifiers of the tickers maintained by the CUDA module handle manager
 */
enum metrics_ticker_type_t : uint8_t {
// NOTE(review): presumably a reserved base value for the ticker list — confirm
__TICKER_BASE__ = 0,
// started before / ended after the cuModuleLoadData issued in
// POSHandle_CUDA_Module::__reload_state
RESTORE_reload_state
};
// tickers indexed by metrics_ticker_type_t
POSMetrics_TickerList<metrics_ticker_type_t> metric_tickers;

/*!
 *  \brief  identifiers of the reducers maintained by the CUDA module handle manager
 */
enum metrics_reducer_type_t : uint8_t {
// NOTE(review): presumably a reserved base value for the reducer list — confirm
__REDUCER_BASE__= 0,
// fed with function_desps.size() of each module after it is reloaded in
// POSHandle_CUDA_Module::__reload_state
RESTORE_nb_reload_functions
};
// reducers indexed by metrics_reducer_type_t, holding uint64_t values
POSMetrics_ReducerList<metrics_reducer_type_t, uint64_t> metric_reducers;

/*!
 *  \brief  log the tickers and reducers collected by this handle manager
 */
void print_metrics() override;
#endif
/* =========================== metric system ============================= */


private:
/*!
* \brief restore the extra fields of handle with specific type
Expand Down
8 changes: 6 additions & 2 deletions pos/cuda_impl/src/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,14 @@ pos_retval_t POSClient_CUDA::init_transport(){

void POSClient_CUDA::deinit_handle_managers(){
#if POS_CONF_RUNTIME_EnableTrace
POSHandleManager<POSHandle> *hm_memory;
POSHandleManager_CUDA_Memory *hm_memory;
POSHandleManager_CUDA_Module *hm_module;

POS_CHECK_POINTER(hm_memory = pos_get_client_typed_hm(this, kPOS_ResourceTypeId_CUDA_Memory, POSHandleManager_CUDA_Memory));
POS_CHECK_POINTER(hm_module = pos_get_client_typed_hm(this, kPOS_ResourceTypeId_CUDA_Module, POSHandleManager_CUDA_Module));

POS_CHECK_POINTER(hm_memory = pos_get_client_typed_hm(this, kPOS_ResourceTypeId_CUDA_Memory, POSHandleManager<POSHandle>));
hm_memory->print_metrics();
hm_module->print_metrics();
#endif

this->__dump_hm_cuda_functions();
Expand Down
35 changes: 22 additions & 13 deletions pos/cuda_impl/src/handle/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,12 +188,6 @@ pos_retval_t POSHandle_CUDA_Memory::__commit(uint64_t version_id, uint64_t strea
cudaError_t cuda_rt_retval;
POSCheckpointSlot *ckpt_slot, *cow_ckpt_slot;

#if POS_CONF_RUNTIME_EnableTrace
((POSHandleManager_CUDA_Memory*)(this->_hm))->metric_tickers.start(
POSHandleManager_CUDA_Memory::CKPT_commit
);
#endif

// TODO: [zhuobin] why we have this call??
cudaSetDevice(0);

Expand Down Expand Up @@ -272,12 +266,6 @@ pos_retval_t POSHandle_CUDA_Memory::__commit(uint64_t version_id, uint64_t strea
}
}

#if POS_CONF_RUNTIME_EnableTrace
((POSHandleManager_CUDA_Memory*)(this->_hm))->metric_tickers.end(
POSHandleManager_CUDA_Memory::CKPT_commit
);
#endif

exit:
return retval;
}
Expand Down Expand Up @@ -440,6 +428,10 @@ pos_retval_t POSHandle_CUDA_Memory::__reload_state(void* mapped, uint64_t ckpt_f
}
POS_CHECK_POINTER(memory_binary.mutable_base());

#if POS_CONF_RUNTIME_EnableTrace
((POSHandleManager_CUDA_Memory*)(this->_hm))->metric_tickers.start(POSHandleManager_CUDA_Memory::RESTORE_reload_state);
#endif

cuda_rt_retval = cudaMemcpyAsync(
/* dst */ this->server_addr,
/* src */ reinterpret_cast<const void*>(memory_binary.mutable_base()->state().c_str()),
Expand All @@ -460,6 +452,10 @@ pos_retval_t POSHandle_CUDA_Memory::__reload_state(void* mapped, uint64_t ckpt_f
goto exit;
}

#if POS_CONF_RUNTIME_EnableTrace
((POSHandleManager_CUDA_Memory*)(this->_hm))->metric_tickers.end(POSHandleManager_CUDA_Memory::RESTORE_reload_state);
#endif

exit:
// this should be the end of using this mmap area, so we release it here
munmap(mapped, ckpt_file_size);
Expand Down Expand Up @@ -750,4 +746,17 @@ pos_retval_t POSHandleManager_CUDA_Memory::__reallocate_single_handle(void* mapp

exit:
return retval;
}
}


#if POS_CONF_RUNTIME_EnableTrace
/*!
 *  \brief  log the tickers collected by the CUDA memory handle manager
 *  \note   print_metrics(), metrics_ticker_type_t and metric_tickers are only
 *          declared when POS_CONF_RUNTIME_EnableTrace is set, so this definition
 *          must sit under the same guard to keep non-trace builds compiling
 */
void POSHandleManager_CUDA_Memory::print_metrics() {
    // human-readable name of each ticker; built once on first call
    static std::unordered_map<metrics_ticker_type_t, std::string> ticker_names = {
        { RESTORE_reload_state, "Restore State" }
    };

    // the resource type of this manager must have a registered name
    POS_ASSERT(pos_resource_map.count(this->_rid) > 0);
    POS_LOG(
        "[HandleManager Metrics] %s:\n%s",
        pos_resource_map[this->_rid].c_str(),
        this->metric_tickers.str(ticker_names).c_str()
    );
}
#endif // POS_CONF_RUNTIME_EnableTrace
29 changes: 29 additions & 0 deletions pos/cuda_impl/src/handle/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ pos_retval_t POSHandle_CUDA_Module::__reload_state(void* mapped, uint64_t ckpt_f
}
POS_CHECK_POINTER(module_binary.mutable_base());

#if POS_CONF_RUNTIME_EnableTrace
((POSHandleManager_CUDA_Module*)(this->_hm))->metric_tickers.start(POSHandleManager_CUDA_Module::RESTORE_reload_state);
#endif

cuda_dv_retval = cuModuleLoadData(
/* module */ &module,
/* image */ reinterpret_cast<const void*>(module_binary.mutable_base()->state().c_str())
Expand All @@ -181,6 +185,14 @@ pos_retval_t POSHandle_CUDA_Module::__reload_state(void* mapped, uint64_t ckpt_f
goto exit;
}

#if POS_CONF_RUNTIME_EnableTrace
((POSHandleManager_CUDA_Module*)(this->_hm))->metric_tickers.end(POSHandleManager_CUDA_Module::RESTORE_reload_state);
((POSHandleManager_CUDA_Module*)(this->_hm))->metric_reducers.reduce(
POSHandleManager_CUDA_Module::RESTORE_nb_reload_functions,
this->function_desps.size()
);
#endif

this->set_server_addr((void*)module);
this->mark_status(kPOS_HandleStatus_Active);
this->mark_state_status(kPOS_HandleStatus_StateReady);
Expand Down Expand Up @@ -443,3 +455,20 @@ pos_retval_t POSHandleManager_CUDA_Module::__reallocate_single_handle(void* mapp
exit:
return retval;
}


#if POS_CONF_RUNTIME_EnableTrace
/*!
 *  \brief  log the tickers and reducers collected by the CUDA module handle manager
 *  \note   print_metrics(), the metric enums and the metric lists are only
 *          declared when POS_CONF_RUNTIME_EnableTrace is set, so this definition
 *          must sit under the same guard to keep non-trace builds compiling
 */
void POSHandleManager_CUDA_Module::print_metrics() {
    // human-readable name of each ticker; built once on first call
    static std::unordered_map<metrics_ticker_type_t, std::string> ticker_names = {
        { RESTORE_reload_state, "Restore State" }
    };
    // human-readable name of each reducer; built once on first call
    static std::unordered_map<metrics_reducer_type_t, std::string> reducer_names = {
        { RESTORE_nb_reload_functions, "# Restored Functions" }
    };

    // the resource type of this manager must have a registered name
    POS_ASSERT(pos_resource_map.count(this->_rid) > 0);
    POS_LOG(
        "[HandleManager Metrics] %s:\n%s\n%s",
        pos_resource_map[this->_rid].c_str(),
        this->metric_tickers.str(ticker_names).c_str(),
        this->metric_reducers.str(reducer_names).c_str()
    );
}
#endif // POS_CONF_RUNTIME_EnableTrace
17 changes: 1 addition & 16 deletions pos/include/handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -818,22 +818,7 @@ class POSHandleManager {
/* =========================== metric system ============================= */
public:
#if POS_CONF_RUNTIME_EnableTrace
enum metrics_ticker_type_t : uint8_t {
CKPT_commit = 0,
};
POSMetrics_TickerList<metrics_ticker_type_t> metric_tickers;

inline void print_metrics(){
std::unordered_map<metrics_ticker_type_t, std::string> ticker_names = {
{ CKPT_commit, "CKPT_commit (GPU Memory -> CPU Memory)" }
};
POS_ASSERT(pos_resource_map.count(this->_rid) > 0);
POS_LOG(
"[HandleManager Metrics] %s:\n%s",
pos_resource_map[this->_rid].c_str(),
this->metric_tickers.str(ticker_names).c_str()
);
}
virtual void print_metrics(){}
#endif
/* =========================== metric system ============================= */

Expand Down
1 change: 1 addition & 0 deletions pos/include/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ runtime_conf.set('conf_runtime_default_client_log_path', conf_runtime_default_cl
runtime_conf.set('conf_runtime_enable_debug_check', conf_runtime_enable_debug_check)
runtime_conf.set('conf_runtime_enable_hijack_api_check', conf_runtime_enable_hijack_api_check)
runtime_conf.set('conf_runtime_enable_trace', conf_runtime_enable_trace)
runtime_conf.set('conf_runtime_enable_memory_trace', conf_runtime_enable_memory_trace)
configure_file(input : 'runtime_configs.h.in', output : 'runtime_configs.h', configuration : runtime_conf)


Expand Down
3 changes: 3 additions & 0 deletions pos/include/runtime_configs.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,6 @@

// whether to enable runtime trace of statistics
#define POS_CONF_RUNTIME_EnableTrace @conf_runtime_enable_trace@

// whether to collect runtime memory trace of statistics
#define POS_CONF_RUNTIME_EnableMemoryTrace @conf_runtime_enable_memory_trace@
11 changes: 5 additions & 6 deletions pos/include/worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ class POSWorker {

// mark restoring phrase
enum pos_worker_restore_phraseid_t : uint8_t {
kPOS_WorkRestorePhrase_Recomputation_Init = 0,
kPOS_WorkRestorePhrase_Init = 0,
kPOS_WorkRestorePhrase_Recomputation,
kPOS_WorkRestorePhrase_Unexecution,
kPOS_WorkRestorePhrase_Normal
Expand Down Expand Up @@ -446,11 +446,10 @@ class POSWorker {

enum metrics_sequence_type_t : uint8_t {
__SEQUENCE_BASE__= 0,
KERNEL_out_handle_state_size,
KERNEL_inout_handle_state_size,
CKPT_nb_cow_handles,
CKPT_nb_cow_stateful_handles,
CKPT_nb_cow_size,
#if POS_CONF_RUNTIME_EnableMemoryTrace
KERNEL_write_state_size,
CKPT_cow_size,
#endif
// note: here could have a crazy metric to collect each kernel's duration
RESTORE_ondemand_restore_handle_nb,
RESTORE_ondemand_restore_handle_with_state_nb,
Expand Down
Loading

0 comments on commit 413c7e1

Please sign in to comment.