From 74d80054acc55250178e89a32e075fd38600aacc Mon Sep 17 00:00:00 2001 From: sbekele Date: Tue, 23 Jul 2024 21:04:06 +0000 Subject: [PATCH] memory sampling --- xprof/btx_interval_model.yaml | 36 ++++++++++++++++ xprof/btx_timeline.cpp | 73 ++++++++++++++++++++++++++------- ze/btx_zeinterval_callbacks.cpp | 59 +++++++++++++++++++++++++- ze/btx_zeinterval_callbacks.hpp | 5 +++ ze/tracer_ze_helpers.include.c | 69 +++++++++++++++++++++++++++++++ ze/ze_events.yaml | 20 +++++++++ 6 files changed, 245 insertions(+), 17 deletions(-) diff --git a/xprof/btx_interval_model.yaml b/xprof/btx_interval_model.yaml index eec61cac..b41d82b7 100644 --- a/xprof/btx_interval_model.yaml +++ b/xprof/btx_interval_model.yaml @@ -257,3 +257,39 @@ :field_class: :type: double :cast_type: float + - :name: lttng:memModule + :payload_field_class: + :type: structure + :members: + - :name: did + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: deviceIdx + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: hMemModule + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: subDevice + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: rdBandwidth + :field_class: + :type: double + :cast_type: float + - :name: wtBandwidth + :field_class: + :type: double + :cast_type: float + - :name: occupancy + :field_class: + :type: double + :cast_type: float diff --git a/xprof/btx_timeline.cpp b/xprof/btx_timeline.cpp index cfde74ae..6e7e12a2 100644 --- a/xprof/btx_timeline.cpp +++ b/xprof/btx_timeline.cpp @@ -32,11 +32,12 @@ struct timeline_dispatch_s { std::unordered_map hp_device2countertracks; std::unordered_map hp_ddomain2telmtracks; + std::unordered_map hp_ddomain2cpytracks; std::unordered_map hp_dfsdev2fptracks; perfetto_pruned::Trace trace; }; -struct FabricDetails { +struct Details { bool RxTx; uint32_t fabricId; uint32_t remotePortId; @@ -44,7 +45,7 @@ struct FabricDetails { using timeline_dispatch_t = struct timeline_dispatch_s; using uuid_getter_t = perfetto_uuid_t (*)(timeline_dispatch_t *, const std::string &, uint64_t, uint64_t, uint32_t, uint64_t, - uint32_t, std::optional); + uint32_t, std::optional
); static perfetto_uuid_t gen_perfetto_uuid() { // Start at one, Look like UUID 0 is special @@ -87,7 +88,7 @@ static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch, std::unordered_map &counter_tracks, const std::string &track_name, const std::string &hostname, uint64_t process_id, thapi_device_id did, uint32_t deviceIdx, uint64_t tHandle, thapi_domain_idx domain, - std::optional details = std::nullopt, + std::optional
details = std::nullopt, std::unordered_map *counter_tracks_fp = nullptr) { perfetto_uuid_t hp_dev_uuid = 0; perfetto_uuid_t hp_uuid; @@ -124,7 +125,14 @@ static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch, oss << track_name << " | SD " << domain; oss << " | " << details->fabricId << "<->" << details->remotePortId << " | " <<(details->RxTx ? " TX" : " RX"); } - else if (track_name=="CopyEngine (%)" || track_name=="ComputeEngine (%)") { + else if (track_name==" Memory" && details) { + oss << track_name << " Module " << domain; + oss << " | " <<(details->RxTx ? "WR BW" : "RD BW"); + } + else if (track_name==" Memory Allocation (%)") { + oss << track_name << " Module " << domain; + } + else if (track_name==" CopyEngine (%)" || track_name==" ComputeEngine (%)") { oss << track_name << " | SubDevice " << domain; } else { @@ -136,36 +144,47 @@ static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch, } static perfetto_uuid_t get_copyEU_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id, - uint64_t did, uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, std::optional options) { - return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "CopyEngine (%)", hostname, process_id, did,deviceIdx, hEngine, subDevice); + uint64_t did, uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, std::optional
options) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2cpytracks, " CopyEngine (%)", hostname, process_id, did,deviceIdx, hEngine, subDevice); } static perfetto_uuid_t get_computeEU_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id, - uint64_t did, uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, std::optional options ) { - return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "ComputeEngine (%)", hostname, process_id, did, deviceIdx, hEngine, subDevice); + uint64_t did, uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, std::optional
options ) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " ComputeEngine (%)", hostname, process_id, did, deviceIdx, hEngine, subDevice); } static perfetto_uuid_t get_fpThroughput_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id, - uint64_t did, uint32_t deviceIdx, uint64_t hFabricPort, uint32_t subDevice, std::optional options) { + uint64_t did, uint32_t deviceIdx, uint64_t hFabricPort, uint32_t subDevice, std::optional
options) { return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "FabricT", hostname, process_id, did, deviceIdx, hFabricPort, subDevice, options, &dispatch->hp_dfsdev2fptracks); } static perfetto_uuid_t get_power_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id, - uint64_t did, uint32_t deviceIdx, uint64_t hPower, uint32_t subDevice, std::optional options) { + uint64_t did, uint32_t deviceIdx, uint64_t hPower, uint32_t subDevice, std::optional
options) { // Extra space to maintain track sequence in the timeline - return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Power", hostname, process_id, did, deviceIdx, hPower, subDevice); + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Power", hostname, process_id, did, deviceIdx, hPower, subDevice); } static perfetto_uuid_t get_frequency_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id, - uint64_t did, uint32_t deviceIdx, uint64_t hFrequency, uint32_t subDevice, std::optional options) { - return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Ferquency", hostname, process_id, did, deviceIdx, hFrequency, subDevice); + uint64_t did, uint32_t deviceIdx, uint64_t hFrequency, uint32_t subDevice, std::optional
options) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Ferquency", hostname, process_id, did, deviceIdx, hFrequency, subDevice); } +static perfetto_uuid_t get_Bandwidth_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id, + uint64_t did, uint32_t deviceIdx, uint64_t hMemModule, uint32_t subDevice, std::optional
options) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Memory", hostname, process_id, did, deviceIdx, hMemModule, subDevice, options, &dispatch->hp_dfsdev2fptracks); +} + +static perfetto_uuid_t get_Occupancy_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id, + uint64_t did, uint32_t deviceIdx, uint64_t hMemModule, uint32_t subDevice, std::optional
options) { + return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Memory Allocation $", hostname, process_id, did, deviceIdx, hMemModule, subDevice); +} + + static void add_event_DTelemetry(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id, uint64_t thread_id, uint64_t did, uint32_t deviceIdx, uint64_t tHandle, uint32_t subDevice, uint64_t timestamp, float value, uuid_getter_t uuid_getter, const std::string &eventName, - std::optional options = std::nullopt) { + std::optional
options = std::nullopt) { perfetto_uuid_t track_uuid; track_uuid = uuid_getter(dispatch, hostname, process_id, did, deviceIdx, tHandle, subDevice, options); @@ -178,12 +197,26 @@ static void add_event_DTelemetry(timeline_dispatch_t *dispatch, const std::strin track_event->set_double_counter_value(value); } +static void add_event_memModule( timeline_dispatch_t *dispatch, std::string hostname, + uint64_t process_id, uint64_t thread_id, uint64_t did, uint32_t deviceIdx, uintptr_t hMemModule, uint32_t subDevice, uint64_t timestamp, float rdBandwidth, float wtBandwidth, float occupancy) { + // Define details for RX throughput. + Details details = {false, 0, 0}; + add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, deviceIdx, hMemModule, subDevice, timestamp, + rdBandwidth, get_Bandwidth_track_uuuid, "Memory Read BW", details); + details.RxTx = true; + add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, deviceIdx, hMemModule, subDevice, timestamp, + wtBandwidth, get_Bandwidth_track_uuuid, "Memory Write BW", details); + add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, deviceIdx, hMemModule, subDevice, timestamp, + occupancy, get_Occupancy_track_uuuid, "Memory Occupancy", details); +} + + static void add_event_fabricPort( timeline_dispatch_t *dispatch, std::string hostname, uint64_t process_id, uint64_t thread_id, uint64_t did, uint32_t deviceIdx, uintptr_t hFabricPort, uint32_t subDevice, uint64_t timestamp, uint32_t fabricId, uint32_t remotePortId, float rxThroughput, float txThroughput, float rxSpeed, float txSpeed) { // Define details for RX throughput. - FabricDetails details = {false, fabricId, remotePortId}; + Details details = {false, fabricId, remotePortId}; add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, deviceIdx, hFabricPort, subDevice, timestamp, rxThroughput, get_fpThroughput_track_uuuid, "Fabric ThroughputRX", details); @@ -467,6 +500,15 @@ static void fabricPort_usr_callback(void *btx_handle, void *usr_data, const char remotePortId, rxThroughput, txThroughput, rxSpeed, txSpeed); } +static void memModule_usr_callback(void *btx_handle, void *usr_data, const char *hostname, + int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend, + uint64_t did, uint32_t deviceIdx, uint64_t hMemModule, uint32_t subDevice, + float rdBandwidth, float wtBandwidth, float occupancy) { + auto *dispatch = static_cast(usr_data); + add_event_memModule(dispatch, hostname, vpid, vtid, did, deviceIdx, hMemModule, subDevice, ts, rdBandwidth, wtBandwidth, occupancy); +} + + void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_lttng_host(btx_handle, &host_usr_callback); btx_register_callbacks_lttng_device(btx_handle, &device_usr_callback); @@ -475,6 +517,7 @@ void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_lttng_computeEU(btx_handle, &computeEU_usr_callback); btx_register_callbacks_lttng_copyEU(btx_handle, ©EU_usr_callback); btx_register_callbacks_lttng_fabricPort(btx_handle, &fabricPort_usr_callback); + btx_register_callbacks_lttng_memModule(btx_handle, &memModule_usr_callback); btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component_callback); btx_register_callbacks_read_params(btx_handle, &read_params_callback); btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component_callback); diff --git a/ze/btx_zeinterval_callbacks.cpp b/ze/btx_zeinterval_callbacks.cpp index 37b56feb..79acf5cf 100644 --- a/ze/btx_zeinterval_callbacks.cpp +++ b/ze/btx_zeinterval_callbacks.cpp @@ -797,7 +797,7 @@ static void lttng_ust_ze_sampling_fabricPort_callback(void *btx_handle, void *us auto subDevice = it0->second.subdeviceId; auto fabricId = it0->second.portId.fabricId; auto remotePortId = pFabricPortState_val->remotePortId.fabricId; - // Current Speed + // Current Speed (not used currently in the timeline) double rxSpeed = static_cast(pFabricPortState_val->rxSpeed.bitRate * pFabricPortState_val->rxSpeed.width)/8.0; double txSpeed = static_cast(pFabricPortState_val->txSpeed.bitRate * pFabricPortState_val->txSpeed.width)/8.0; @@ -832,6 +832,47 @@ static void lttng_ust_ze_sampling_fabricPort_callback(void *btx_handle, void *us } } +static void lttng_ust_ze_sampling_memStats_callback(void *btx_handle, void *usr_data, int64_t ts, + const char *hostname, int64_t vpid, + uint64_t vtid, ze_device_handle_t hDevice, + zes_mem_handle_t hMemModule, + size_t _pMemState_val_length, + zes_mem_state_t *pMemState_val, + size_t _pMemBandwidth_val_length, + zes_mem_bandwidth_t *pMemBandwidth_val) { + auto *data = static_cast(usr_data); + const auto it0 = data->memModule_property.find({hostname, vpid, hDevice, hMemModule}); + if (it0 != data->memModule_property.cend()) { + // Get memModule properties: subdevice ID ... + auto subDevice = it0->second.subdeviceId; + // Insert the current bandwidth data with timestamp + auto [it, inserted] = data->device_memModule_ref.insert( + {{hostname, vpid, hDevice, hMemModule, subDevice}, {*pMemBandwidth_val, ts}}); + if (inserted) + return; + + // Previous bandwidth data + auto &[prev_bandwidth, prev_ts] = it->second; + + if (pMemBandwidth_val->timestamp == prev_bandwidth.timestamp) + return; + // Calculate the RD and WT bandwidth + //https://spec.oneapi.io/level-zero/latest/sysman/api.html#_CPPv419zes_mem_bandwidth_t + double allocation = static_cast(pMemState_val->size - pMemState_val->free) * 100.0 / static_cast(pMemState_val->size); + double time_diff = static_cast(pMemBandwidth_val->timestamp - prev_bandwidth.timestamp); + double rdBandwidth = static_cast(pMemBandwidth_val->readCounter - prev_bandwidth.readCounter) * 1e6 / (time_diff * pMemBandwidth_val->maxBandwidth); + double wtBandwidth = static_cast(pMemBandwidth_val->writeCounter - prev_bandwidth.writeCounter) * 1e6 / (time_diff * pMemBandwidth_val->maxBandwidth); + DeviceHash uuid_idx = get_device_hash(usr_data, hostname, vpid, hDevice); + btx_push_message_lttng_memModule(btx_handle, hostname, 0, 0, prev_ts, BACKEND_ZE, + uuid_idx.hash, uuid_idx.deviceIdx, (uint64_t)hMemModule, subDevice, + rdBandwidth, wtBandwidth, allocation); + // Update the stored values + it->second = {*pMemBandwidth_val, ts}; + } else { + std::cerr << "Memory property not found!" << std::endl; + } +} + static void lttng_ust_ze_sampling_engineStats_callback(void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, ze_device_handle_t hDevice, @@ -842,7 +883,7 @@ static void lttng_ust_ze_sampling_engineStats_callback(void *btx_handle, void *u const auto it0 = data->engine_property.find({hostname, vpid, hDevice, hEngine}); if (it0 != data->engine_property.cend()) { const auto& engineProps = it0->second; - uint32_t subDevice = engineProps.subdeviceId; // (engineProps.onSubdevice) ? engineProps.subdeviceId : 0; + uint32_t subDevice = engineProps.subdeviceId; if (engineProps.type == ZES_ENGINE_GROUP_COMPUTE_ALL || engineProps.type == ZES_ENGINE_GROUP_COPY_ALL) { auto [it, inserted] = data->device_engines_ref.insert( @@ -931,6 +972,16 @@ static void lttng_ust_ze_sampling_fabricPortProperties_callback(void *btx_handle data->fabricPort_property[{hostname, vpid, (ze_device_handle_t)hDevice, (zes_fabric_port_handle_t)hFabricPort}] = *pFabricPortProperties_val; } +static void lttng_ust_ze_sampling_memoryProperties_callback(void *btx_handle, void *usr_data, int64_t ts, + const char *hostname, int64_t vpid, uint64_t vtid, + ze_device_handle_t hDevice, zes_mem_handle_t hMemModule, + size_t _pMemModuleProperties_val_length, + zes_mem_properties_t *pMemModuleProperties_val) { + auto *data = static_cast(usr_data); + data->memModule_property[{hostname, vpid, (ze_device_handle_t)hDevice, (zes_mem_handle_t)hMemModule}] = *pMemModuleProperties_val; +} + + static void lttng_ust_ze_sampling_powerProperties_callback(void *btx_handle, void *usr_data, int64_t ts, const char *hostname, int64_t vpid, uint64_t vtid, ze_device_handle_t hDevice, zes_pwr_handle_t hPower, @@ -1070,8 +1121,12 @@ void btx_register_usr_callbacks(void *btx_handle) { btx_handle, <tng_ust_ze_sampling_engineProperties_callback); btx_register_callbacks_lttng_ust_ze_sampling_freqProperties( btx_handle, <tng_ust_ze_sampling_freqProperties_callback); + btx_register_callbacks_lttng_ust_ze_sampling_memoryProperties( + btx_handle, <tng_ust_ze_sampling_memoryProperties_callback); // Telemetries + btx_register_callbacks_lttng_ust_ze_sampling_memStats( + btx_handle, <tng_ust_ze_sampling_memStats_callback); btx_register_callbacks_lttng_ust_ze_sampling_fabricPort( btx_handle, <tng_ust_ze_sampling_fabricPort_callback); btx_register_callbacks_lttng_ust_ze_sampling_gpu_energy( diff --git a/ze/btx_zeinterval_callbacks.hpp b/ze/btx_zeinterval_callbacks.hpp index 53b49569..8349152d 100644 --- a/ze/btx_zeinterval_callbacks.hpp +++ b/ze/btx_zeinterval_callbacks.hpp @@ -23,6 +23,7 @@ typedef std::tuple hp_module_t; typedef std::map memory_interval_t; typedef std::tuple clock_lttng_device_t; +typedef std::tuple memModule_timestamp_t; typedef std::tuple fabricPort_timestamp_t; typedef std::tuple energy_timestamp_t; typedef std::tuple engines_timestamp_t; @@ -34,10 +35,12 @@ typedef std::tuple hpdd_ typedef std::tuple hpdsd_t; typedef std::tuple hpdf_t; +typedef std::tuple hpdm_t; typedef std::tuple hpdpwr_t; typedef std::tuple hpdfreq_t; typedef std::tuple hpdeng_t; +typedef std::tuple hpdmsd_t; typedef std::tuple hpdfsd_t; typedef std::tuple hpdesd_t; typedef std::tuple hpdpwrd_t; @@ -103,6 +106,7 @@ struct data_s { std::unordered_map sampling_device_property; std::unordered_map sampling_sub_device_property; std::unordered_map fabricPort_property; + std::unordered_map memModule_property; std::unordered_map power_property; std::unordered_map frequency_property; std::unordered_map engine_property; @@ -110,5 +114,6 @@ struct data_s { std::unordered_map device_energy_ref; std::unordered_map device_engines_ref; std::unordered_map device_fabricPort_ref; + std::unordered_map device_memModule_ref; }; typedef struct data_s data_t; diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 446970cf..436234fc 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -791,6 +791,7 @@ static inline void _dump_memory_info(ze_command_list_handle_t hCommandList, cons static int _sampling_freq_initialized = 0; static int _sampling_fabricPorts_initialized = 0; +static int _sampling_memModules_initialized = 0; static int _sampling_pwr_initialized = 0; static int _sampling_engines_initialized = 0; // Static handles to stay throughout the execution @@ -801,11 +802,13 @@ static zes_freq_handle_t*** _sampling_hFrequencies = NULL; static zes_pwr_handle_t*** _sampling_hPowers = NULL; static zes_engine_handle_t*** _sampling_engineHandles = NULL; static zes_fabric_port_handle_t*** _sampling_hFabricPort = NULL; +static zes_mem_handle_t*** _sampling_hMemModule = NULL; static uint32_t _sampling_driverCount = 0; static uint32_t* _sampling_deviceCount = NULL; static uint32_t** _sampling_subDeviceCount = NULL; static uint32_t** _sampling_freqDomainCounts = NULL; static uint32_t** _sampling_fabricPortCount = NULL; +static uint32_t** _sampling_memModuleCount = NULL; static uint32_t** _sampling_powerDomainCounts = NULL; static uint32_t** _sampling_engineCounts = NULL; @@ -966,6 +969,46 @@ static void intializeFabricPorts() { _sampling_fabricPorts_initialized = 1; } +static void intializeMemModules() { + ze_result_t res; + _sampling_hMemModule = (zes_mem_handle_t***) calloc(_sampling_driverCount, sizeof(zes_mem_handle_t**)); + _sampling_memModuleCount = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*)); + for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { + _sampling_memModuleCount[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); + _sampling_hMemModule[driverIdx] = (zes_mem_handle_t**) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_mem_handle_t*)); + for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { + // Get fabric ports for each device + _sampling_memModuleCount[driverIdx][deviceIdx]=0; + res = ZES_DEVICE_ENUM_MEMORY_MODULES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_memModuleCount[driverIdx][deviceIdx], NULL); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("1st ZES_DEVICE_ENUM_MEMORY_MODULES_PTR", res); + _sampling_memModuleCount[driverIdx][deviceIdx] = 0; + continue; + } + _sampling_hMemModule[driverIdx][deviceIdx] = (zes_mem_handle_t*) calloc(_sampling_memModuleCount[driverIdx][deviceIdx], sizeof(zes_mem_handle_t)); + res = ZES_DEVICE_ENUM_MEMORY_MODULES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_memModuleCount[driverIdx][deviceIdx], _sampling_hMemModule[driverIdx][deviceIdx]); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("2nd ZES_DEVICE_ENUM_MEMORY_MODULES_PTR", res); + _sampling_memModuleCount[driverIdx][deviceIdx] = 0; + free(_sampling_hMemModule[driverIdx][deviceIdx]); + } + for (uint32_t memModuleIdx = 0; memModuleIdx < _sampling_memModuleCount[driverIdx][deviceIdx]; ++memModuleIdx) { + zes_mem_properties_t memProps = {0}; + memProps.stype = ZES_STRUCTURE_TYPE_MEM_PROPERTIES; + res = ZES_MEMORY_GET_PROPERTIES_PTR(_sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], &memProps); + if (res != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_MEMORY_GET_PROPERTIES_PTR", res); + } + //Dump fabricPortProperties once + do_tracepoint(lttng_ust_ze_sampling, memoryProperties, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_mem_handle_t)_sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], + &memProps); + } + } + } + _sampling_memModules_initialized = 1; +} + static int initializeHandles() { ze_result_t res; const char *e = getenv("ZES_ENABLE_SYSMAN"); @@ -1060,6 +1103,7 @@ static int initializeHandles() { intializePower(); intializeEngines(); intializeFabricPorts(); + intializeMemModules(); return 0; } @@ -1099,6 +1143,28 @@ static void readFabricPorts_dump(uint32_t driverIdx, uint32_t deviceIdx) { } } +static void readMemModules_dump(uint32_t driverIdx, uint32_t deviceIdx) { + if (!_sampling_memModules_initialized) return; + ze_result_t result; + for (uint32_t memModuleIdx = 0; memModuleIdx < _sampling_memModuleCount[driverIdx][deviceIdx]; ++memModuleIdx) { + zes_mem_state_t memState = {0}; + memState.stype = ZES_STRUCTURE_TYPE_MEM_STATE; + zes_mem_bandwidth_t memBandwidth = {0}; + result = ZES_MEMORY_GET_STATE_PTR(_sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], &memState); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_MEMORY_GET_STATE_PTR", result); + continue; + } + result = ZES_MEMORY_GET_BANDWIDTH_PTR(_sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], &memBandwidth); + if (result != ZE_RESULT_SUCCESS) { + _ZE_ERROR_MSG("ZES_MEMORY_GET_BANDWIDTH_PTR", result); + continue; + } + do_tracepoint(lttng_ust_ze_sampling, memStats, (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], + (zes_mem_handle_t)_sampling_hMemModule[driverIdx][deviceIdx][memModuleIdx], &memState, &memBandwidth); + } +} + static void readEnergy_dump(uint32_t driverIdx, uint32_t deviceIdx) { if (!_sampling_pwr_initialized) return; ze_result_t result; @@ -1142,6 +1208,9 @@ static void thapi_sampling_energy() { if (tracepoint_enabled(lttng_ust_ze_sampling, engineStats)){ readEngines_dump(driverIdx, deviceIdx); } + if (tracepoint_enabled(lttng_ust_ze_sampling, memStats)){ + readMemModules_dump(driverIdx, deviceIdx); + } } } } diff --git a/ze/ze_events.yaml b/ze/ze_events.yaml index c9109254..3a157aaf 100644 --- a/ze/ze_events.yaml +++ b/ze/ze_events.yaml @@ -97,6 +97,26 @@ lttng_ust_ze_sampling: - [ ctf_integer_hex, uintptr_t, hFabricPort, "(uintptr_t)hFabricPort" ] - [ ctf_sequence_text, uint8_t, pFabricPortState_val, pFabricPortState, size_t, "sizeof(zes_fabric_port_state_t)" ] - [ ctf_sequence_text, uint8_t, pFabricPortThroughput_val, pFabricPortThroughput, size_t, "sizeof(zes_fabric_port_throughput_t)" ] + - name: memoryProperties + args: + - [ ze_device_handle_t, hDevice ] + - [ zes_mem_handle_t, hMemModule ] + - [ zes_mem_properties_t *, pMemModuleProperties ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer_hex, uintptr_t, hMemModule, "(uintptr_t)hMemModule" ] + - [ ctf_sequence_text, uint8_t, pMemModuleProperties_val, pMemModuleProperties, size_t, "sizeof(zes_mem_properties_t)" ] + - name: memStats + args: + - [ ze_device_handle_t, hDevice ] + - [ zes_mem_handle_t, hMemModule ] + - [ zes_mem_state_t *, pMemState ] + - [ zes_mem_bandwidth_t *, pMemBandwidth ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer_hex, uintptr_t, hMemModule, "(uintptr_t)hMemModule" ] + - [ ctf_sequence_text, uint8_t, pMemState_val, pMemState, size_t, "sizeof(zes_mem_state_t)" ] + - [ ctf_sequence_text, uint8_t, pMemBandwidth_val, pMemBandwidth, size_t, "sizeof(zes_mem_bandwidth_t)" ] lttng_ust_ze_profiling: events: - name: event_profiling