-
Notifications
You must be signed in to change notification settings - Fork 436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UCP/PROTO: UCX_PROTO_INFO=used option #10395
base: master
Are you sure you want to change the base?
Changes from 1 commit
2723acc
a1ead26
69a81e0
c22e3fb
3ea1f16
4212b9d
d7ecd6a
a296bd3
807fe74
f90bea9
ef0e249
3ef05f8
d1c33ca
778101f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,6 +61,7 @@ struct ucp_proto_perf_node { | |
|
||
/* Protocol information table */ | ||
typedef struct { | ||
char counter_str[32]; | ||
char range_str[32]; | ||
char desc[UCP_PROTO_DESC_STR_MAX]; | ||
char config[UCP_PROTO_CONFIG_STR_MAX]; | ||
|
@@ -153,42 +154,69 @@ static void ucp_proto_table_row_separator(ucs_string_buffer_t *strb, | |
} | ||
|
||
static int ucp_proto_debug_is_info_enabled(ucp_context_h context, | ||
const char *select_param_str) | ||
const char *select_param_str, | ||
int show_used) | ||
{ | ||
const char *proto_info_config = context->config.ext.proto_info; | ||
int bool_value; | ||
|
||
if (show_used) { | ||
return !strcmp(proto_info_config, "used"); | ||
} | ||
|
||
if (ucs_config_sscanf_bool(proto_info_config, &bool_value, NULL)) { | ||
return bool_value; | ||
} | ||
|
||
return fnmatch(proto_info_config, select_param_str, FNM_CASEFOLD) == 0; | ||
} | ||
|
||
static inline size_t | ||
ucp_proto_select_elem_selected_count(const ucp_proto_select_elem_t *select_elem) | ||
{ | ||
#ifdef ENABLE_STATS | ||
const ucp_proto_threshold_elem_t *thresh_elem = select_elem->thresholds; | ||
size_t total_selected_count = 0; | ||
|
||
do { | ||
total_selected_count += thresh_elem->proto_config.selected_count; | ||
} while ((thresh_elem++)->max_msg_length < SIZE_MAX); | ||
|
||
return total_selected_count; | ||
#else | ||
return 0; | ||
#endif | ||
} | ||
|
||
static void | ||
ucp_proto_select_elem_info(ucp_worker_h worker, | ||
ucp_worker_cfg_index_t ep_cfg_index, | ||
ucp_worker_cfg_index_t rkey_cfg_index, | ||
const ucp_proto_select_param_t *select_param, | ||
ucp_proto_select_elem_t *select_elem, int show_all, | ||
ucs_string_buffer_t *strb) | ||
int show_used, ucs_string_buffer_t *strb) | ||
{ | ||
UCS_STRING_BUFFER_ONSTACK(ep_cfg_strb, UCP_PROTO_CONFIG_STR_MAX); | ||
UCS_STRING_BUFFER_ONSTACK(sel_param_strb, UCP_PROTO_CONFIG_STR_MAX); | ||
static const char *info_row_fmt = "| %*s | %-*s | %-*s |\n"; | ||
static const char *info_row_fmt = "| %s%*s | %-*s | %-*s |\n"; | ||
ucp_proto_info_table_t table; | ||
int hdr_col_width[2], col_width[3]; | ||
ucp_proto_query_attr_t proto_attr; | ||
ucp_proto_info_row_t *row_elem; | ||
size_t range_start, range_end; | ||
int proto_valid; | ||
|
||
if (show_used && (0 == ucp_proto_select_elem_selected_count(select_elem))) { | ||
return; | ||
} | ||
|
||
ucp_proto_select_param_dump(worker, ep_cfg_index, rkey_cfg_index, | ||
select_param, ucp_operation_descs, &ep_cfg_strb, | ||
&sel_param_strb); | ||
if (!show_all && | ||
!ucp_proto_debug_is_info_enabled( | ||
worker->context, ucs_string_buffer_cstr(&sel_param_strb))) { | ||
worker->context, ucs_string_buffer_cstr(&sel_param_strb), | ||
show_used)) { | ||
return; | ||
} | ||
|
||
|
@@ -219,7 +247,16 @@ ucp_proto_select_elem_info(ucp_worker_h worker, | |
ucs_memunits_range_str(range_start, range_end, row_elem->range_str, | ||
sizeof(row_elem->range_str)); | ||
|
||
col_width[0] = ucs_max(col_width[0], strlen(row_elem->range_str)); | ||
if (show_used && (proto_attr.selected_count > 0)) { | ||
ucs_snprintf_safe(row_elem->counter_str, | ||
sizeof(row_elem->counter_str), "%zu ", | ||
proto_attr.selected_count); | ||
} else { | ||
row_elem->counter_str[0] = '\0'; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe print There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's easily doable, I thought about this, but IMO having zeroes in the output does not improve readability:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. imo it is better, than showing the counter only for some rows, but not others There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, will be done |
||
} | ||
|
||
col_width[0] = ucs_max(col_width[0], strlen(row_elem->counter_str) + | ||
strlen(row_elem->range_str)); | ||
col_width[1] = ucs_max(col_width[1], strlen(row_elem->desc)); | ||
col_width[2] = ucs_max(col_width[2], strlen(row_elem->config)); | ||
} while (range_end != SIZE_MAX); | ||
|
@@ -241,7 +278,8 @@ ucp_proto_select_elem_info(ucp_worker_h worker, | |
/* Print contents */ | ||
ucp_proto_table_row_separator(strb, col_width, 3); | ||
ucs_array_for_each(row_elem, &table) { | ||
ucs_string_buffer_appendf(strb, info_row_fmt, col_width[0], | ||
ucs_string_buffer_appendf(strb, info_row_fmt, row_elem->counter_str, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the meaning of these numbers will not be obvious to users. Please add the value to a separate column with the name. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I followed the existing design - as you can see there are no really named columns, e.g. for message range. The column name is actually just the config name. So if I add a new named column just for selections count - that would look ridiculous IMO There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps, we should add a row with column names: size range / protocol / resources / use count. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure about the added value of this change. On the other hand - it would also increase the output size - and this is exactly the issue I'm trying to solve with this commit. |
||
col_width[0] - strlen(row_elem->counter_str), | ||
row_elem->range_str, col_width[1], | ||
row_elem->desc, col_width[2], | ||
row_elem->config); | ||
|
@@ -262,7 +300,7 @@ void ucp_proto_select_info(ucp_worker_h worker, | |
|
||
kh_foreach(proto_select->hash, key.u64, select_elem, | ||
ucp_proto_select_elem_info(worker, ep_cfg_index, rkey_cfg_index, | ||
&key.param, &select_elem, show_all, | ||
&key.param, &select_elem, show_all, 0, | ||
strb); | ||
ucs_string_buffer_appendf(strb, "\n")) | ||
} | ||
|
@@ -964,7 +1002,7 @@ ucp_proto_select_write_info(ucp_worker_h worker, | |
ucp_operation_names, &ep_cfg_strb, | ||
&sel_param_strb); | ||
if (!ucp_proto_debug_is_info_enabled( | ||
worker->context, ucs_string_buffer_cstr(&sel_param_strb))) { | ||
worker->context, ucs_string_buffer_cstr(&sel_param_strb), 0)) { | ||
goto out; | ||
} | ||
|
||
|
@@ -1029,17 +1067,19 @@ ucp_proto_select_write_info(ucp_worker_h worker, | |
} | ||
|
||
void ucp_proto_select_elem_trace(ucp_worker_h worker, | ||
ucp_worker_cfg_index_t ep_cfg_index, | ||
ucp_worker_cfg_index_t rkey_cfg_index, | ||
const ucp_proto_select_param_t *select_param, | ||
ucp_proto_select_elem_t *select_elem) | ||
ucp_proto_select_elem_t *select_elem, | ||
int show_used) | ||
{ | ||
ucs_string_buffer_t strb = UCS_STRING_BUFFER_INITIALIZER; | ||
const ucp_proto_config_t *proto_config = &select_elem->thresholds[0].proto_config; | ||
ucp_worker_cfg_index_t ep_cfg_index = proto_config->ep_cfg_index; | ||
ucp_worker_cfg_index_t rkey_cfg_index = proto_config->rkey_cfg_index; | ||
ucs_string_buffer_t strb = UCS_STRING_BUFFER_INITIALIZER; | ||
char *line; | ||
|
||
/* Print human-readable protocol selection table to the log */ | ||
ucp_proto_select_elem_info(worker, ep_cfg_index, rkey_cfg_index, | ||
select_param, select_elem, 0, &strb); | ||
select_param, select_elem, 0, show_used, &strb); | ||
ucs_string_buffer_for_each_token(line, &strb, "\n") { | ||
ucs_log_print_compact(line); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seems we display the data only during working destroy, but what if we have a long running application and we want to show the info without waiting for app exit?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This scenario is not really covered by the initial impl, as it does not seem to be a mainstream use case.
Here are 2 ideas what we can do:
UCX_PRORO_INFO=y
optiontimerfd_create
andtimerfd_settime
, that would triggerucp_worker_trace_configs
.Personally I believe it's not really worth the effort. But if you think it's useful, can be done maybe in the next PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we print the info after the count reaches a certain threshold?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In short - I think that may not be always reliable, and requires operations on fast-path
Currently the idea is that we support this feature in 2 modes:
ENABLE_DEBUG_DATA
) - when we get precise statistics. Requires adding increment operation to the fast path.Btw, what's your opinion on that? Should have have these 2 modes of operation, or we just enable this feature always?
Basically in release mode the counter is never incremented above 1, so we cannot rely on this.
And in debug mode this would require to introduce one more if branch to the fast-path.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO we need to try making the feature available also in release mode without adding overhead when it is not enabled
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I implemented it with absolutely zero overhead when the feature is disabled.
It required some additions to the logger, hopefully you like it.
If not, we can remove logger event listener and have one extra if branch in the fast-path.
Please let me know
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks to @brminich , I removed this logger stuff, and found the right place for the increment.
Now it's really zero overhead when feature is disabled