Skip to content

Commit

Permalink
Update JSON utilization API to support CPU-only, GPU-only and multi-a…
Browse files Browse the repository at this point in the history
…rchitecture systems (#525)
  • Loading branch information
tpatki authored Mar 11, 2024
1 parent b449ef2 commit d3f847b
Show file tree
Hide file tree
Showing 12 changed files with 53 additions and 202 deletions.
7 changes: 3 additions & 4 deletions src/docs/sphinx/VariorumAPI.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,10 @@ The API to obtain node utilization has the following format. It takes a string
(``char**``) by reference as input, and populates this string with a JSON object
with total CPU, system CPU, user CPU, total memory, and GPU (when available)
utilizations. It reports the utilization of each available GPU. GPU utilization
is accomplished using the ``int variorum_get_gpu_utilization_json(char
**get_gpu_util_obj_str)`` function. The total memory utilization is computed
is obtained using the NVML and RSMI APIs. The total memory utilization is computed
using ``/proc/meminfo``, and CPU utilization is computed using ``/proc/stat``.

The ``variorum_get_node_utilization_json(char **get_util_obj_str)`` function
The ``variorum_get_utilization_json(char **get_util_obj_str)`` function
returns a nested JSON object serialized as a string. An example is provided below:

.. code::
Expand All @@ -150,7 +149,7 @@ returns a string type nested JSON object. An example is provided below:
The ``*`` here refers to socket ID, and the ``#`` refers to GPU ID.

The ``variorum_get_node_utilization_json(char **get_util_obj_str)`` function
The ``variorum_get_utilization_json(char **get_util_obj_str)`` function
returns a nested JSON object serialized as a string. An example is provided below:

.. code::
Expand Down
3 changes: 1 addition & 2 deletions src/docs/sphinx/api/json_support_functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,5 @@ Defined in ``variorum/variorum.h``.

.. doxygenfunction:: variorum_get_frequency_json

.. doxygenfunction:: variorum_get_node_utilization_json
.. doxygenfunction:: variorum_get_utilization_json

.. doxygenfunction:: variorum_get_gpu_utilization_json
3 changes: 1 addition & 2 deletions src/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@ set(BASIC_EXAMPLES
variorum-disable-turbo-example
variorum-enable-turbo-example
variorum-get-frequency-json-example
variorum-get-gpu-utilization-json-example
variorum-get-node-power-domain-info-json-example
variorum-get-power-json-example
variorum-get-thermals-json-example
variorum-get-node-utilization-json-example
variorum-get-utilization-json-example
variorum-get-topology-info-example
variorum-integration-using-json-example
variorum-monitoring-to-file-example
Expand Down
88 changes: 0 additions & 88 deletions src/examples/variorum-get-node-utilization-json-example.c

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ int main(int argc, char **argv)
return -1;
}
}
ret = variorum_get_gpu_utilization_json(&s);
ret = variorum_get_utilization_json(&s);
if (ret != 0)
{
printf("First run: JSON get node utilization failed!\n");
Expand All @@ -69,7 +69,7 @@ int main(int argc, char **argv)
x += do_work(i);
}
printf("Final result: %f\n", x);
ret = variorum_get_gpu_utilization_json(&s);
ret = variorum_get_utilization_json(&s);
if (ret != 0)
{
printf("Second run: JSON get node utilization failed!\n");
Expand Down
3 changes: 1 addition & 2 deletions src/variorum/AMD_GPU/amd_gpu_power_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -789,7 +789,6 @@ void get_gpu_utilization_data_json(int chipid, int total_sockets,
rsmi_status_t ret;
uint32_t num_devices;
int gpus_per_socket;
int d = 0;
char socket_id[12];
char hostname[1024];
char device_id[12];
Expand Down Expand Up @@ -869,7 +868,7 @@ void get_gpu_utilization_data_json(int chipid, int total_sockets,
getenv("HOSTNAME"), __FILE__, __FUNCTION__,
__LINE__);
}
snprintf(device_id, 12, "GPU%d_util%%", d);
snprintf(device_id, 12, "GPU%d_util%%", i);
json_object_set_new(socket_obj, device_id, json_integer(utilpercent));
}

Expand Down
2 changes: 1 addition & 1 deletion src/variorum/AMD_GPU/config_amd_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ int set_amd_gpu_func_ptrs(int idx)
amd_gpu_instinct_get_gpu_utilization;
g_platform[idx].variorum_get_thermals_json = amd_gpu_instinct_get_thermals_json;
g_platform[idx].variorum_get_frequency_json = amd_gpu_instinct_get_clocks_json;
g_platform[idx].variorum_get_gpu_utilization_json =
g_platform[idx].variorum_get_utilization_json =
amd_gpu_instinct_get_gpu_utilization_json;
/* Initialize control interfaces */
g_platform[idx].variorum_cap_each_gpu_power_limit =
Expand Down
2 changes: 1 addition & 1 deletion src/variorum/Nvidia_GPU/config_nvidia.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ int set_nvidia_func_ptrs(int idx)
g_platform[idx].variorum_print_gpu_utilization = volta_get_gpu_utilization;
g_platform[idx].variorum_get_thermals_json = volta_get_thermals_json;
g_platform[idx].variorum_get_frequency_json = volta_get_clocks_json;
g_platform[idx].variorum_get_gpu_utilization_json =
g_platform[idx].variorum_get_utilization_json =
volta_get_gpu_utilization_json;
/* Initialize control interfaces */
g_platform[idx].variorum_cap_each_gpu_power_limit =
Expand Down
2 changes: 1 addition & 1 deletion src/variorum/config_architecture.c
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ void variorum_init_func_ptrs()
g_platform[i].variorum_print_turbo = NULL;
g_platform[i].variorum_poll_power = NULL;
g_platform[i].variorum_print_gpu_utilization = NULL;
g_platform[i].variorum_get_gpu_utilization_json = NULL;
g_platform[i].variorum_get_utilization_json = NULL;
g_platform[i].variorum_monitoring = NULL;
g_platform[i].variorum_get_power_json = NULL;
g_platform[i].variorum_get_node_power_domain_info_json = NULL;
Expand Down
4 changes: 2 additions & 2 deletions src/variorum/config_architecture.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,10 @@ struct platform
/// @return Error code.
int (*variorum_print_gpu_utilization)(int long_ver);

/// @brief Function pointer to get JSON object for GPU utilization
/// @brief Function pointer to get JSON object for utilization
///
/// @return Error code.
int (*variorum_get_gpu_utilization_json)(char **get_gpu_util_obj_str);
int (*variorum_get_utilization_json)(char **get_util_obj_str);

/// @brief Function pointer to get JSON object for node power data.
///
Expand Down
97 changes: 39 additions & 58 deletions src/variorum/variorum.c
Original file line number Diff line number Diff line change
Expand Up @@ -1090,7 +1090,7 @@ int variorum_get_power_json(char **get_power_obj_str)
return err;
}

int variorum_get_node_utilization_json(char **get_util_obj_str)
int variorum_get_utilization_json(char **get_util_obj_str)
{
int err = 0;
err = variorum_enter(__FILE__, __FUNCTION__, __LINE__);
Expand All @@ -1099,20 +1099,12 @@ int variorum_get_node_utilization_json(char **get_util_obj_str)
return -1;
}

err = variorum_exit(__FILE__, __FUNCTION__, __LINE__);
if (err)
{
return -1;
}

char hostname[1024];
struct timeval tv;
uint64_t ts;
char *gpu_util_str = NULL;
gethostname(hostname, 1024);
gettimeofday(&tv, NULL);
ts = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec;
int ret;
char str[100];
const char d[2] = " ";
char *token, *s, *p;
Expand All @@ -1136,33 +1128,63 @@ int variorum_get_node_utilization_json(char **get_util_obj_str)
uint64_t mem_free = 0;
uint64_t sys_time = 0;
int strcp;
int idx = -1;

json_t *get_util_obj = NULL;
json_t *get_cpu_util_obj = NULL;
json_t *get_timestamp_obj = NULL;
json_t *cpu_util_obj = NULL;

// Look for a GPU build and get an ID.
for (idx = 0; idx < P_NUM_PLATFORMS; idx++)
{
#ifdef VARIORUM_WITH_INTEL_GPU
idx = P_INTEL_GPU_IDX;
break;
#endif
#ifdef VARIORUM_WITH_NVIDIA_GPU
idx = P_NVIDIA_GPU_IDX;
break;
#endif
#ifdef VARIORUM_WITH_AMD_GPU
idx = P_AMD_GPU_IDX;
break;
#endif
}

// If we have a GPU build, obtain the GPU object first.
#if defined(VARIORUM_WITH_NVIDIA_GPU) || defined(VARIORUM_WITH_AMD_GPU) || defined(VARIORUM_WITH_INTEL_GPU)
int ret;
char *gpu_util_str = NULL;
// get gpu utilization
ret = variorum_get_gpu_utilization_json(&gpu_util_str);
ret = g_platform[idx].variorum_get_utilization_json(&gpu_util_str);
if (ret != 0)
{
printf("JSON get gpu utilization failed. Exiting.\n");
free(gpu_util_str);
return -1;
}

/* Load the string as a JSON object using Jansson */
json_t *get_util_obj = json_loads(gpu_util_str, JSON_DECODE_ANY, NULL);
/* Load the existing GPU string as a JSON object using Jansson */
get_util_obj = json_loads(gpu_util_str, JSON_DECODE_ANY, NULL);
get_cpu_util_obj = json_object_get(get_util_obj, hostname);
get_timestamp_obj = json_object_get(get_cpu_util_obj, "timestamp");
cpu_util_obj = json_object_get(get_cpu_util_obj, "CPU");
#endif

json_t *get_cpu_util_obj = json_object_get(get_util_obj, hostname);
if (get_cpu_util_obj == NULL)
//CPU-only build will have this object as NULL.
if (get_util_obj == NULL)
{
get_util_obj = json_object();
get_cpu_util_obj = json_object();
json_object_set_new(get_util_obj, hostname, get_cpu_util_obj);
}

json_t *get_timestamp_obj = json_object_get(get_util_obj, "timestamp");
if (get_timestamp_obj == NULL)
{
json_object_set_new(get_cpu_util_obj, "timestamp", json_integer(ts));
}

json_t *cpu_util_obj = json_object_get(get_cpu_util_obj, "CPU");
if (cpu_util_obj == NULL)
{
cpu_util_obj = json_object();
Expand Down Expand Up @@ -1237,6 +1259,7 @@ int variorum_get_node_utilization_json(char **get_util_obj_str)
last_sum = sum;
last_sys_time = sys_time;
last_idle = sum_idle;

json_object_set_new(cpu_util_obj, "total_util%", json_real(cpu_util));
json_object_set_new(cpu_util_obj, "user_util%", json_real(user_util));
json_object_set_new(cpu_util_obj, "system_util%", json_real(sys_util));
Expand Down Expand Up @@ -1293,48 +1316,6 @@ int variorum_get_node_utilization_json(char **get_util_obj_str)
*get_util_obj_str = json_dumps(get_util_obj, JSON_INDENT(4));
json_decref(get_util_obj);
state = 1;
return 0;
}

int variorum_get_gpu_utilization_json(char **get_gpu_util_obj_str)
{
int err = 0;
int i;
err = variorum_enter(__FILE__, __FUNCTION__, __LINE__);
if (err)
{
return -1;
}

for (i = 0; i < P_NUM_PLATFORMS; i++)
{
#ifdef VARIORUM_WITH_INTEL_GPU
i = P_INTEL_GPU_IDX;
break;
#endif
#ifdef VARIORUM_WITH_NVIDIA_GPU
i = P_NVIDIA_GPU_IDX;
break;
#endif
#ifdef VARIORUM_WITH_AMD_GPU
i = P_AMD_GPU_IDX;
break;
#endif
}

if (g_platform[i].variorum_get_gpu_utilization_json == NULL)
{
variorum_error_handler("Feature not yet implemented or is not supported",
VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED,
getenv("HOSTNAME"), __FILE__,
__FUNCTION__, __LINE__);
return -1;
}
err = g_platform[i].variorum_get_gpu_utilization_json(get_gpu_util_obj_str);
if (err)
{
return -1;
}

err = variorum_exit(__FILE__, __FUNCTION__, __LINE__);
if (err)
Expand Down
Loading

0 comments on commit d3f847b

Please sign in to comment.