-
Notifications
You must be signed in to change notification settings - Fork 436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UCP/PROTO: UCX_PROTO_INFO=used option #10395
base: master
Are you sure you want to change the base?
Changes from 3 commits
2723acc
a1ead26
69a81e0
c22e3fb
3ea1f16
4212b9d
d7ecd6a
a296bd3
807fe74
f90bea9
ef0e249
3ef05f8
d1c33ca
778101f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,6 +61,7 @@ struct ucp_proto_perf_node { | |
|
||
/* Protocol information table */ | ||
typedef struct { | ||
char counter_str[32]; | ||
char range_str[32]; | ||
char desc[UCP_PROTO_DESC_STR_MAX]; | ||
char config[UCP_PROTO_CONFIG_STR_MAX]; | ||
|
@@ -153,42 +154,69 @@ static void ucp_proto_table_row_separator(ucs_string_buffer_t *strb, | |
} | ||
|
||
static int ucp_proto_debug_is_info_enabled(ucp_context_h context, | ||
const char *select_param_str) | ||
const char *select_param_str, | ||
int show_used) | ||
{ | ||
const char *proto_info_config = context->config.ext.proto_info; | ||
int bool_value; | ||
|
||
if (show_used) { | ||
return !strcmp(proto_info_config, "used"); | ||
} | ||
|
||
if (ucs_config_sscanf_bool(proto_info_config, &bool_value, NULL)) { | ||
return bool_value; | ||
} | ||
|
||
return fnmatch(proto_info_config, select_param_str, FNM_CASEFOLD) == 0; | ||
} | ||
|
||
static inline unsigned | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe make it boolean and breal immideately once some selection is found? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
ucp_proto_select_elem_selections(const ucp_proto_select_elem_t *select_elem) | ||
{ | ||
#ifdef ENABLE_DEBUG_DATA | ||
const ucp_proto_threshold_elem_t *thresh_elem = select_elem->thresholds; | ||
unsigned total_selections = 0; | ||
|
||
do { | ||
total_selections += thresh_elem->proto_config.selections; | ||
} while ((thresh_elem++)->max_msg_length < SIZE_MAX); | ||
|
||
return total_selections; | ||
#else | ||
return 0; | ||
#endif | ||
} | ||
|
||
static void | ||
ucp_proto_select_elem_info(ucp_worker_h worker, | ||
ucp_worker_cfg_index_t ep_cfg_index, | ||
ucp_worker_cfg_index_t rkey_cfg_index, | ||
const ucp_proto_select_param_t *select_param, | ||
ucp_proto_select_elem_t *select_elem, int show_all, | ||
ucs_string_buffer_t *strb) | ||
int show_used, ucs_string_buffer_t *strb) | ||
{ | ||
UCS_STRING_BUFFER_ONSTACK(ep_cfg_strb, UCP_PROTO_CONFIG_STR_MAX); | ||
UCS_STRING_BUFFER_ONSTACK(sel_param_strb, UCP_PROTO_CONFIG_STR_MAX); | ||
static const char *info_row_fmt = "| %*s | %-*s | %-*s |\n"; | ||
static const char *info_row_fmt = "| %s%*s | %-*s | %-*s |\n"; | ||
ucp_proto_info_table_t table; | ||
int hdr_col_width[2], col_width[3]; | ||
ucp_proto_query_attr_t proto_attr; | ||
ucp_proto_info_row_t *row_elem; | ||
size_t range_start, range_end; | ||
int proto_valid; | ||
|
||
if (show_used && (0 == ucp_proto_select_elem_selections(select_elem))) { | ||
return; | ||
} | ||
|
||
ucp_proto_select_param_dump(worker, ep_cfg_index, rkey_cfg_index, | ||
select_param, ucp_operation_descs, &ep_cfg_strb, | ||
&sel_param_strb); | ||
if (!show_all && | ||
!ucp_proto_debug_is_info_enabled( | ||
worker->context, ucs_string_buffer_cstr(&sel_param_strb))) { | ||
worker->context, ucs_string_buffer_cstr(&sel_param_strb), | ||
show_used)) { | ||
return; | ||
} | ||
|
||
|
@@ -219,7 +247,16 @@ ucp_proto_select_elem_info(ucp_worker_h worker, | |
ucs_memunits_range_str(range_start, range_end, row_elem->range_str, | ||
sizeof(row_elem->range_str)); | ||
|
||
col_width[0] = ucs_max(col_width[0], strlen(row_elem->range_str)); | ||
if (show_used && (proto_attr.selections > 0)) { | ||
ucs_snprintf_safe(row_elem->counter_str, | ||
sizeof(row_elem->counter_str), "%u ", | ||
proto_attr.selections); | ||
} else { | ||
row_elem->counter_str[0] = '\0'; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe print There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's easily doable, I thought about this, but IMO having zeroes in the output does not improve readability:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. imo it is better, than showing the counter only for some rows, but not others There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, will be done |
||
} | ||
|
||
col_width[0] = ucs_max(col_width[0], strlen(row_elem->counter_str) + | ||
strlen(row_elem->range_str)); | ||
col_width[1] = ucs_max(col_width[1], strlen(row_elem->desc)); | ||
col_width[2] = ucs_max(col_width[2], strlen(row_elem->config)); | ||
} while (range_end != SIZE_MAX); | ||
|
@@ -241,7 +278,8 @@ ucp_proto_select_elem_info(ucp_worker_h worker, | |
/* Print contents */ | ||
ucp_proto_table_row_separator(strb, col_width, 3); | ||
ucs_array_for_each(row_elem, &table) { | ||
ucs_string_buffer_appendf(strb, info_row_fmt, col_width[0], | ||
ucs_string_buffer_appendf(strb, info_row_fmt, row_elem->counter_str, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the meaning of these numbers will not be obvious to users. Please add the value to a separate column with the name. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I followed the existing design - as you can see there are no really named columns, e.g. for message range. The column name is actually just the config name. So if I add a new named column just for selections count - that would look ridiculous IMO There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps, we should add a row with column names: size range / protocol / resources / use count. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure about the added value of this change. On the other hand - it would also increase the output size - and this is exactly the issue I'm trying to solve with this commit. |
||
col_width[0] - strlen(row_elem->counter_str), | ||
row_elem->range_str, col_width[1], | ||
row_elem->desc, col_width[2], | ||
row_elem->config); | ||
|
@@ -262,7 +300,7 @@ void ucp_proto_select_info(ucp_worker_h worker, | |
|
||
kh_foreach(proto_select->hash, key.u64, select_elem, | ||
ucp_proto_select_elem_info(worker, ep_cfg_index, rkey_cfg_index, | ||
&key.param, &select_elem, show_all, | ||
&key.param, &select_elem, show_all, 0, | ||
strb); | ||
ucs_string_buffer_appendf(strb, "\n")) | ||
} | ||
|
@@ -964,7 +1002,7 @@ ucp_proto_select_write_info(ucp_worker_h worker, | |
ucp_operation_names, &ep_cfg_strb, | ||
&sel_param_strb); | ||
if (!ucp_proto_debug_is_info_enabled( | ||
worker->context, ucs_string_buffer_cstr(&sel_param_strb))) { | ||
worker->context, ucs_string_buffer_cstr(&sel_param_strb), 0)) { | ||
goto out; | ||
} | ||
|
||
|
@@ -1029,17 +1067,19 @@ ucp_proto_select_write_info(ucp_worker_h worker, | |
} | ||
|
||
void ucp_proto_select_elem_trace(ucp_worker_h worker, | ||
ucp_worker_cfg_index_t ep_cfg_index, | ||
ucp_worker_cfg_index_t rkey_cfg_index, | ||
const ucp_proto_select_param_t *select_param, | ||
ucp_proto_select_elem_t *select_elem) | ||
ucp_proto_select_elem_t *select_elem, | ||
int show_used) | ||
{ | ||
ucs_string_buffer_t strb = UCS_STRING_BUFFER_INITIALIZER; | ||
const ucp_proto_config_t *proto_config = &select_elem->thresholds[0].proto_config; | ||
ucp_worker_cfg_index_t ep_cfg_index = proto_config->ep_cfg_index; | ||
ucp_worker_cfg_index_t rkey_cfg_index = proto_config->rkey_cfg_index; | ||
ucs_string_buffer_t strb = UCS_STRING_BUFFER_INITIALIZER; | ||
char *line; | ||
|
||
/* Print human-readable protocol selection table to the log */ | ||
ucp_proto_select_elem_info(worker, ep_cfg_index, rkey_cfg_index, | ||
select_param, select_elem, 0, &strb); | ||
select_param, select_elem, 0, show_used, &strb); | ||
ucs_string_buffer_for_each_token(line, &strb, "\n") { | ||
ucs_log_print_compact(line); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,6 +81,7 @@ ucp_proto_select_lookup(ucp_worker_h worker, ucp_proto_select_t *proto_select, | |
size_t msg_length) | ||
{ | ||
const ucp_proto_select_elem_t *select_elem; | ||
const ucp_proto_threshold_elem_t *thresh; | ||
ucp_proto_select_key_t key; | ||
khiter_t khiter; | ||
|
||
|
@@ -108,7 +109,11 @@ ucp_proto_select_lookup(ucp_worker_h worker, ucp_proto_select_t *proto_select, | |
proto_select->cache.value = select_elem; | ||
} | ||
|
||
return ucp_proto_select_thresholds_search(select_elem, msg_length); | ||
thresh = ucp_proto_select_thresholds_search(select_elem, msg_length); | ||
#ifdef ENABLE_DEBUG_DATA | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do you need it? Does it mean that used option will not work in the release mode? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, it won't work in release mode There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. imo, we need it in release mode. We ask customers to provide this output quite often. Initial protocol selection is not a fast-path, or am i missing something? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This feature is not about initial protocol selection, it counts all the protocol selections during send execution, so in fast-path. So we add an increment operation to the fast-path, which is not a big deal There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i see would it be possible to to increase only on slow path? Then you would not need a counter, but rather a boolean indicating whether the protocol was selected at least once There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, it is possible, but it would reduce the added value IMO. But look, maybe we can use a combined approach:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, unfortunately it's not that easy with Release mode.. This is a problem, because we don't want to take into account this "probing" selection. If we count it, then we basically display all the protocols, not actually selected ones. If we don't count it - then we can't detect selection of short protocols, because
But I don't see a good solution for Release mode... |
||
++((ucp_proto_threshold_elem_t *)thresh)->proto_config.selections; | ||
#endif | ||
return thresh; | ||
} | ||
|
||
/* | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seems we display the data only during working destroy, but what if we have a long running application and we want to show the info without waiting for app exit?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This scenario is not really covered by the initial impl, as it does not seem to be a mainstream use case.
Here are 2 ideas what we can do:
UCX_PRORO_INFO=y
optiontimerfd_create
andtimerfd_settime
, that would triggerucp_worker_trace_configs
.Personally I believe it's not really worth the effort. But if you think it's useful, can be done maybe in the next PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we print the info after the count reaches a certain threshold?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In short - I think that may not be always reliable, and requires operations on fast-path
Currently the idea is that we support this feature in 2 modes:
ENABLE_DEBUG_DATA
) - when we get precise statistics. Requires adding increment operation to the fast path.Btw, what's your opinion on that? Should have have these 2 modes of operation, or we just enable this feature always?
Basically in release mode the counter is never incremented above 1, so we cannot rely on this.
And in debug mode this would require to introduce one more if branch to the fast-path.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO we need to try making the feature available also in release mode without adding overhead when it is not enabled
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, I implemented it with absolutely zero overhead when the feature is disabled.
It required some additions to the logger, hopefully you like it.
If not, we can remove logger event listener and have one extra if branch in the fast-path.
Please let me know
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks to @brminich , I removed this logger stuff, and found the right place for the increment.
Now it's really zero overhead when feature is disabled