From 905d6be36f9455b408fa866aa92eccf17655d7ee Mon Sep 17 00:00:00 2001 From: Brice Videau Date: Mon, 13 May 2024 15:46:40 -0500 Subject: [PATCH 1/7] Make only local master do energy profiling. --- xprof/xprof.rb.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 16289f41..285601d2 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -386,7 +386,7 @@ def env_tracers end # Sample - if sampling? + if sampling? && mpi_local_master> LOGGER.debug('Sampling Enabled') h['LTTNG_UST_SAMPLING'] = 1 h['LTTNG_UST_SAMPLING_ENERGY'] = 1 From 80be1816eee760241f0b1eb1bef8e25b4cfc44f8 Mon Sep 17 00:00:00 2001 From: Brice Videau Date: Wed, 15 May 2024 11:44:04 -0500 Subject: [PATCH 2/7] Use ZES to query devices in order to get around affinity masks. --- ze/tracer_ze_helpers.include.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 292e993f..9d0e71db 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -794,7 +794,7 @@ static int _sampling_pwr_initialized = 0; static int _sampling_engines_initialized = 0; // Static handles to stay throughout the execution static ze_driver_handle_t* _sampling_hDrivers = NULL; -static ze_device_handle_t** _sampling_hDevices = NULL; +static zes_device_handle_t** _sampling_hDevices = NULL; static zes_freq_handle_t*** _sampling_hFrequencies = NULL; static zes_pwr_handle_t*** _sampling_hPowers = NULL; static zes_engine_handle_t*** _sampling_engineHandles = NULL; @@ -914,13 +914,6 @@ static int initializeHandles() { fprintf(stderr,"ZES_ENABLE_SYSMAN needs to be set!\n"); return -1; } -#ifdef CALL_ZEINIT - res = zeInit(ZE_INIT_FLAG_GPU_ONLY); - if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("zeInit", res); - return -1; - } -#endif // Query driver _sampling_driverCount = 0; @@ -940,27 +933,30 @@ static int initializeHandles() { _sampling_hDevices = (ze_device_handle_t**) calloc(_sampling_driverCount, sizeof(ze_device_handle_t*)); // Query device count for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { - res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); + res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); if (res != ZE_RESULT_SUCCESS || _sampling_deviceCount[driverIdx] == 0) { fprintf(stderr, "ERROR: No device found!\n"); - _ZE_ERROR_MSG("1st ZE_DEVICE_GET_PTR", res); + _ZE_ERROR_MSG("1st ZES_DEVICE_GET_PTR", res); return -1; } _sampling_hDevices[driverIdx] = (ze_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(ze_device_handle_t)); - res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); + res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("2nd ZE_DEVICE_GET_PTR", res); + _ZE_ERROR_MSG("2nd ZES_DEVICE_GET_PTR", res); free(_sampling_hDevices[driverIdx]); return -1; } //Get no sub-devices _sampling_subDeviceCount[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { - res = ZE_DEVICE_GET_SUB_DEVICES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_subDeviceCount[driverIdx][deviceIdx], NULL); + zes_device_properties_t deviceProperties = {0}; + deviceProperties.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES; + res = ZES_DEVICE_GET_PROPERTIES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &deviceProperties); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("ZE_DEVICE_GET_SUB_DEVICES_PTR", res); + _ZE_ERROR_MSG("ZES_DEVICE_GET_PROPERTIES_PTR", res); _sampling_subDeviceCount[driverIdx][deviceIdx] = 0; - } + } else + _sampling_subDeviceCount[driverIdx][deviceIdx] = deviceProperties.numSubdevices; if (_sampling_subDeviceCount[driverIdx][deviceIdx] == 0) { _sampling_subDeviceCount[driverIdx][deviceIdx] = 1; } From f700a26c64cd2ed32abf0d4be27d8c7e7b97d9a8 Mon Sep 17 00:00:00 2001 From: Brice Videau Date: Wed, 15 May 2024 11:53:52 -0500 Subject: [PATCH 3/7] Use ZES for drivers as well. --- ze/tracer_ze_helpers.include.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 9d0e71db..31036433 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -793,7 +793,7 @@ static int _sampling_freq_initialized = 0; static int _sampling_pwr_initialized = 0; static int _sampling_engines_initialized = 0; // Static handles to stay throughout the execution -static ze_driver_handle_t* _sampling_hDrivers = NULL; +static zes_driver_handle_t* _sampling_hDrivers = NULL; static zes_device_handle_t** _sampling_hDevices = NULL; static zes_freq_handle_t*** _sampling_hFrequencies = NULL; static zes_pwr_handle_t*** _sampling_hPowers = NULL; @@ -917,20 +917,20 @@ static int initializeHandles() { // Query driver _sampling_driverCount = 0; - res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, NULL); + res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, NULL); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("1st ZE_DRIVER_GET_PTR", res); + _ZE_ERROR_MSG("1st ZES_DRIVER_GET_PTR", res); return -1; } - _sampling_hDrivers = (ze_driver_handle_t*) calloc(_sampling_driverCount, sizeof(ze_driver_handle_t)); - res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers); + _sampling_hDrivers = (zes_driver_handle_t*) calloc(_sampling_driverCount, sizeof(zes_driver_handle_t)); + res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("2nd ZE_DRIVER_GET_PTR", res); + _ZE_ERROR_MSG("2nd ZES_DRIVER_GET_PTR", res); return -1; } _sampling_deviceCount = (uint32_t*) calloc(_sampling_driverCount, sizeof(uint32_t)); _sampling_subDeviceCount = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*)); - _sampling_hDevices = (ze_device_handle_t**) calloc(_sampling_driverCount, sizeof(ze_device_handle_t*)); + _sampling_hDevices = (zes_device_handle_t**) calloc(_sampling_driverCount, sizeof(zes_device_handle_t*)); // Query device count for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); @@ -939,7 +939,7 @@ static int initializeHandles() { _ZE_ERROR_MSG("1st ZES_DEVICE_GET_PTR", res); return -1; } - _sampling_hDevices[driverIdx] = (ze_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(ze_device_handle_t)); + _sampling_hDevices[driverIdx] = (zes_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_device_handle_t)); res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("2nd ZES_DEVICE_GET_PTR", res); From 93346ad41b3fa3c0ff6048e3e133755a3531deed Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 17 Sep 2024 17:14:33 +0000 Subject: [PATCH 4/7] set ZES --- xprof/xprof.rb.in | 12 ++++++++---- ze/tracer_ze_helpers.include.c | 6 +++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 285601d2..b8b13473 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -332,8 +332,7 @@ end def sampling? return false unless OPTIONS[:sample] - - env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '0') == '0' || mpi_local_master? + env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '1') == '0' || mpi_local_master? end def env_tracers @@ -386,11 +385,16 @@ def env_tracers end # Sample - if sampling? && mpi_local_master> + # Currently the same so doesn't the tracing, and the sampling + # This mean that is the local rank is not part of the `traced-ranks` + # No sampling will be performed + if sampling? LOGGER.debug('Sampling Enabled') h['LTTNG_UST_SAMPLING'] = 1 h['LTTNG_UST_SAMPLING_ENERGY'] = 1 - h['ZES_ENABLE_SYSMAN'] = 1 if OPTIONS[:'backend-names'].include?('ze') + # The current only reliable way to use zes api + # is to call zesInit and set ZES_ENABLE_SYSMAN to 0 + h['ZES_ENABLE_SYSMAN'] = 0 if OPTIONS[:'backend-names'].include?('ze') end backends = [] unless need_backend diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 31036433..b9f8179d 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -909,9 +909,9 @@ static void intializeEngines() { static int initializeHandles() { ze_result_t res; - const char *e = getenv("ZES_ENABLE_SYSMAN"); - if (!(e && e[0] == '1')) { - fprintf(stderr,"ZES_ENABLE_SYSMAN needs to be set!\n"); + res = zesInit(0); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("zesInit", res); return -1; } From a14a9b0406d609a2fcde1a5cf842762b64c878d2 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 17 Sep 2024 13:52:16 -0500 Subject: [PATCH 5/7] Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau --- ze/tracer_ze_helpers.include.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index b9f8179d..704cb748 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -909,7 +909,7 @@ static void intializeEngines() { static int initializeHandles() { ze_result_t res; - res = zesInit(0); + res = ZES_INIT_PTR(0); if (res != ZE_RESULT_SUCCESS) { _ZE_ERROR_MSG("zesInit", res); return -1; From c2107c7486db2ec478b9fe3db2d9dd8c4c583bfc Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 17 Sep 2024 13:52:21 -0500 Subject: [PATCH 6/7] Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau --- ze/tracer_ze_helpers.include.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 704cb748..164149a9 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -911,7 +911,7 @@ static int initializeHandles() { ze_result_t res; res = ZES_INIT_PTR(0); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("zesInit", res); + _ZE_ERROR_MSG("ZES_INIT_PTR", res); return -1; } From e00e1205a71a0127037ca52f6cf6809b4a30c90c Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 17 Sep 2024 14:47:38 -0500 Subject: [PATCH 7/7] Update xprof/xprof.rb.in --- xprof/xprof.rb.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index b8b13473..1c3dde99 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -385,7 +385,7 @@ def env_tracers end # Sample - # Currently the same so doesn't the tracing, and the sampling + # Currently the same `so` does the tracing, and the sampling # This mean that is the local rank is not part of the `traced-ranks` # No sampling will be performed if sampling?