Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Single rank profiling #288

Merged
merged 7 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions xprof/xprof.rb.in
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,7 @@ end

def sampling?
return false unless OPTIONS[:sample]

env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '0') == '0' || mpi_local_master?
env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '1') == '0' || mpi_local_master?
end

def env_tracers
Expand Down Expand Up @@ -386,11 +385,16 @@ def env_tracers
end

# Sample
# Currently the same `so` does both the tracing and the sampling.
# This means that if the local rank is not part of the `traced-ranks`,
# no sampling will be performed.
if sampling?
LOGGER.debug('Sampling Enabled')
h['LTTNG_UST_SAMPLING'] = 1
h['LTTNG_UST_SAMPLING_ENERGY'] = 1
h['ZES_ENABLE_SYSMAN'] = 1 if OPTIONS[:'backend-names'].include?('ze')
# Currently, the only reliable way to use the zes API
# is to call zesInit and set ZES_ENABLE_SYSMAN to 0
h['ZES_ENABLE_SYSMAN'] = 0 if OPTIONS[:'backend-names'].include?('ze')
end

backends = [] unless need_backend
Expand Down
46 changes: 21 additions & 25 deletions ze/tracer_ze_helpers.include.c
Original file line number Diff line number Diff line change
Expand Up @@ -793,8 +793,8 @@ static int _sampling_freq_initialized = 0;
static int _sampling_pwr_initialized = 0;
static int _sampling_engines_initialized = 0;
// Static handles to stay throughout the execution
static ze_driver_handle_t* _sampling_hDrivers = NULL;
static ze_device_handle_t** _sampling_hDevices = NULL;
static zes_driver_handle_t* _sampling_hDrivers = NULL;
static zes_device_handle_t** _sampling_hDevices = NULL;
static zes_freq_handle_t*** _sampling_hFrequencies = NULL;
static zes_pwr_handle_t*** _sampling_hPowers = NULL;
static zes_engine_handle_t*** _sampling_engineHandles = NULL;
Expand Down Expand Up @@ -909,58 +909,54 @@ static void intializeEngines() {

static int initializeHandles() {
ze_result_t res;
const char *e = getenv("ZES_ENABLE_SYSMAN");
if (!(e && e[0] == '1')) {
fprintf(stderr,"ZES_ENABLE_SYSMAN needs to be set!\n");
return -1;
}
#ifdef CALL_ZEINIT
res = zeInit(ZE_INIT_FLAG_GPU_ONLY);
res = ZES_INIT_PTR(0);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("zeInit", res);
_ZE_ERROR_MSG("ZES_INIT_PTR", res);
return -1;
}
#endif

// Query driver
_sampling_driverCount = 0;
res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, NULL);
res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, NULL);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("1st ZE_DRIVER_GET_PTR", res);
_ZE_ERROR_MSG("1st ZES_DRIVER_GET_PTR", res);
return -1;
}
_sampling_hDrivers = (ze_driver_handle_t*) calloc(_sampling_driverCount, sizeof(ze_driver_handle_t));
res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers);
_sampling_hDrivers = (zes_driver_handle_t*) calloc(_sampling_driverCount, sizeof(zes_driver_handle_t));
res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("2nd ZE_DRIVER_GET_PTR", res);
_ZE_ERROR_MSG("2nd ZES_DRIVER_GET_PTR", res);
return -1;
}
_sampling_deviceCount = (uint32_t*) calloc(_sampling_driverCount, sizeof(uint32_t));
_sampling_subDeviceCount = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*));
_sampling_hDevices = (ze_device_handle_t**) calloc(_sampling_driverCount, sizeof(ze_device_handle_t*));
_sampling_hDevices = (zes_device_handle_t**) calloc(_sampling_driverCount, sizeof(zes_device_handle_t*));
// Query device count
for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) {
res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL);
res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL);
if (res != ZE_RESULT_SUCCESS || _sampling_deviceCount[driverIdx] == 0) {
fprintf(stderr, "ERROR: No device found!\n");
_ZE_ERROR_MSG("1st ZE_DEVICE_GET_PTR", res);
_ZE_ERROR_MSG("1st ZES_DEVICE_GET_PTR", res);
return -1;
}
_sampling_hDevices[driverIdx] = (ze_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(ze_device_handle_t));
res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]);
_sampling_hDevices[driverIdx] = (zes_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_device_handle_t));
res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("2nd ZE_DEVICE_GET_PTR", res);
_ZE_ERROR_MSG("2nd ZES_DEVICE_GET_PTR", res);
free(_sampling_hDevices[driverIdx]);
return -1;
}
// Get the number of sub-devices
_sampling_subDeviceCount[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t));
for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) {
res = ZE_DEVICE_GET_SUB_DEVICES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_subDeviceCount[driverIdx][deviceIdx], NULL);
zes_device_properties_t deviceProperties = {0};
deviceProperties.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
res = ZES_DEVICE_GET_PROPERTIES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &deviceProperties);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("ZE_DEVICE_GET_SUB_DEVICES_PTR", res);
_ZE_ERROR_MSG("ZES_DEVICE_GET_PROPERTIES_PTR", res);
_sampling_subDeviceCount[driverIdx][deviceIdx] = 0;
}
} else
_sampling_subDeviceCount[driverIdx][deviceIdx] = deviceProperties.numSubdevices;
if (_sampling_subDeviceCount[driverIdx][deviceIdx] == 0) {
_sampling_subDeviceCount[driverIdx][deviceIdx] = 1;
}
Expand Down