Skip to content

Commit

Permalink
memory sampling
Browse files Browse the repository at this point in the history
  • Loading branch information
solo2abera committed Jul 23, 2024
1 parent b76ca86 commit 74d8005
Show file tree
Hide file tree
Showing 6 changed files with 245 additions and 17 deletions.
36 changes: 36 additions & 0 deletions xprof/btx_interval_model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,39 @@
:field_class:
:type: double
:cast_type: float
# lttng:memModule: one telemetry sample for a device memory module.
# Payload: device identifiers (did, deviceIdx), the memory-module handle and
# its subDevice index, followed by three floating-point measurements
# (rdBandwidth, wtBandwidth, occupancy) that are declared as double fields and
# cast to float.
- :name: lttng:memModule
:payload_field_class:
:type: structure
:members:
- :name: did
:field_class:
:type: integer_unsigned
:field_value_range: 64
:cast_type: uint64_t
- :name: deviceIdx
:field_class:
:type: integer_unsigned
:field_value_range: 32
:cast_type: uint32_t
- :name: hMemModule
:field_class:
:type: integer_unsigned
:field_value_range: 64
:cast_type: uint64_t
- :name: subDevice
:field_class:
:type: integer_unsigned
:field_value_range: 32
:cast_type: uint32_t
- :name: rdBandwidth
:field_class:
:type: double
:cast_type: float
- :name: wtBandwidth
:field_class:
:type: double
:cast_type: float
- :name: occupancy
:field_class:
:type: double
:cast_type: float
73 changes: 58 additions & 15 deletions xprof/btx_timeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,20 @@ struct timeline_dispatch_s {

std::unordered_map<h_device_t, perfetto_uuid_t> hp_device2countertracks;
std::unordered_map<hp_ddomain_t, perfetto_uuid_t> hp_ddomain2telmtracks;
std::unordered_map<hp_ddomain_t, perfetto_uuid_t> hp_ddomain2cpytracks;
std::unordered_map<hp_dfsdev_t, perfetto_uuid_t> hp_dfsdev2fptracks;
perfetto_pruned::Trace trace;
};

struct FabricDetails {
struct Details {
bool RxTx;
uint32_t fabricId;
uint32_t remotePortId;
};

using timeline_dispatch_t = struct timeline_dispatch_s;
using uuid_getter_t = perfetto_uuid_t (*)(timeline_dispatch_t *, const std::string &, uint64_t, uint64_t, uint32_t, uint64_t,
uint32_t, std::optional<FabricDetails>);
uint32_t, std::optional<Details>);

static perfetto_uuid_t gen_perfetto_uuid() {
// Start at one, Look like UUID 0 is special
Expand Down Expand Up @@ -87,7 +88,7 @@ static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch,
std::unordered_map<hp_ddomain_t, perfetto_uuid_t> &counter_tracks,
const std::string &track_name, const std::string &hostname, uint64_t process_id,
thapi_device_id did, uint32_t deviceIdx, uint64_t tHandle, thapi_domain_idx domain,
std::optional<FabricDetails> details = std::nullopt,
std::optional<Details> details = std::nullopt,
std::unordered_map<hp_dfsdev_t, perfetto_uuid_t> *counter_tracks_fp = nullptr) {
perfetto_uuid_t hp_dev_uuid = 0;
perfetto_uuid_t hp_uuid;
Expand Down Expand Up @@ -124,7 +125,14 @@ static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch,
oss << track_name << " | SD " << domain;
oss << " | " << details->fabricId << "<->" << details->remotePortId << " | " <<(details->RxTx ? " TX" : " RX");
}
else if (track_name=="CopyEngine (%)" || track_name=="ComputeEngine (%)") {
else if (track_name==" Memory" && details) {
oss << track_name << " Module " << domain;
oss << " | " <<(details->RxTx ? "WR BW" : "RD BW");
}
else if (track_name==" Memory Allocation (%)") {
oss << track_name << " Module " << domain;
}
else if (track_name==" CopyEngine (%)" || track_name==" ComputeEngine (%)") {
oss << track_name << " | SubDevice " << domain;
}
else {
Expand All @@ -136,36 +144,47 @@ static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch,
}

static perfetto_uuid_t get_copyEU_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id,
uint64_t did, uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, std::optional<FabricDetails> options) {
return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "CopyEngine (%)", hostname, process_id, did,deviceIdx, hEngine, subDevice);
uint64_t did, uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, std::optional<Details> options) {
return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2cpytracks, " CopyEngine (%)", hostname, process_id, did,deviceIdx, hEngine, subDevice);
}

static perfetto_uuid_t get_computeEU_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id,
uint64_t did, uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, std::optional<FabricDetails> options ) {
return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "ComputeEngine (%)", hostname, process_id, did, deviceIdx, hEngine, subDevice);
uint64_t did, uint32_t deviceIdx, uint64_t hEngine, uint32_t subDevice, std::optional<Details> options ) {
return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " ComputeEngine (%)", hostname, process_id, did, deviceIdx, hEngine, subDevice);
}

static perfetto_uuid_t get_fpThroughput_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id,
uint64_t did, uint32_t deviceIdx, uint64_t hFabricPort, uint32_t subDevice, std::optional<FabricDetails> options) {
uint64_t did, uint32_t deviceIdx, uint64_t hFabricPort, uint32_t subDevice, std::optional<Details> options) {
return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, "FabricT", hostname, process_id, did, deviceIdx, hFabricPort, subDevice, options, &dispatch->hp_dfsdev2fptracks);
}

static perfetto_uuid_t get_power_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id,
uint64_t did, uint32_t deviceIdx, uint64_t hPower, uint32_t subDevice, std::optional<FabricDetails> options) {
uint64_t did, uint32_t deviceIdx, uint64_t hPower, uint32_t subDevice, std::optional<Details> options) {
// Extra space to maintain track sequence in the timeline
return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Power", hostname, process_id, did, deviceIdx, hPower, subDevice);
return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Power", hostname, process_id, did, deviceIdx, hPower, subDevice);
}


static perfetto_uuid_t get_frequency_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id,
uint64_t did, uint32_t deviceIdx, uint64_t hFrequency, uint32_t subDevice, std::optional<FabricDetails> options) {
return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Ferquency", hostname, process_id, did, deviceIdx, hFrequency, subDevice);
uint64_t did, uint32_t deviceIdx, uint64_t hFrequency, uint32_t subDevice, std::optional<Details> options) {
return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Ferquency", hostname, process_id, did, deviceIdx, hFrequency, subDevice);
}

// uuid_getter_t implementation for memory-module bandwidth counters.
// Resolves (creating on first use) the " Memory" counter track for the given
// device / memory-module handle / sub-device. `options` is forwarded so the
// generic lookup can tell the read and write tracks apart via Details::RxTx.
// NOTE(review): the direction-aware map handed over here is hp_dfsdev2fptracks,
// the same one the fabric-port getter uses; confirm the key cannot collide
// between fabric ports and memory modules.
static perfetto_uuid_t get_Bandwidth_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id,
                                                 uint64_t did, uint32_t deviceIdx, uint64_t hMemModule, uint32_t subDevice, std::optional<Details> options) {
  auto &domain_tracks = dispatch->hp_ddomain2telmtracks;
  auto *direction_tracks = &dispatch->hp_dfsdev2fptracks;
  return get_counter_track_uuuid(dispatch, domain_tracks, " Memory", hostname, process_id, did, deviceIdx,
                                 hMemModule, subDevice, options, direction_tracks);
}

// uuid_getter_t implementation for the memory-module occupancy counter.
// Resolves (creating on first use) the occupancy counter track for the given
// device / memory-module handle / sub-device. `options` is accepted only to
// satisfy the uuid_getter_t signature and is not forwarded.
//
// The track name must match, byte for byte, the literal that
// get_counter_track_uuuid compares against to pick the "Module <n>" label;
// the previous " Memory Allocation $" never matched that comparison, so the
// track fell through to the generic naming branch.
static perfetto_uuid_t get_Occupancy_track_uuuid(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id,
                                                 uint64_t did, uint32_t deviceIdx, uint64_t hMemModule, uint32_t subDevice, std::optional<Details> options) {
  return get_counter_track_uuuid(dispatch, dispatch->hp_ddomain2telmtracks, " Memory Allocation (%)", hostname, process_id, did, deviceIdx, hMemModule, subDevice);
}


static void add_event_DTelemetry(timeline_dispatch_t *dispatch, const std::string &hostname, uint64_t process_id,
uint64_t thread_id, uint64_t did, uint32_t deviceIdx, uint64_t tHandle, uint32_t subDevice,
uint64_t timestamp, float value, uuid_getter_t uuid_getter, const std::string &eventName,
std::optional<FabricDetails> options = std::nullopt) {
std::optional<Details> options = std::nullopt) {
perfetto_uuid_t track_uuid;
track_uuid = uuid_getter(dispatch, hostname, process_id, did, deviceIdx, tHandle, subDevice, options);

Expand All @@ -178,12 +197,26 @@ static void add_event_DTelemetry(timeline_dispatch_t *dispatch, const std::strin
track_event->set_double_counter_value(value);
}

// Emits the three counter samples carried by one memory-module telemetry
// message: read bandwidth, write bandwidth and occupancy, all at `timestamp`.
//
// The two bandwidth samples use the same track getter and are told apart by
// Details::RxTx (false -> read, true -> write). fabricId / remotePortId have
// no meaning for a memory module and are left at 0.
static void add_event_memModule( timeline_dispatch_t *dispatch, std::string hostname,
                                 uint64_t process_id, uint64_t thread_id, uint64_t did, uint32_t deviceIdx, uintptr_t hMemModule, uint32_t subDevice, uint64_t timestamp, float rdBandwidth, float wtBandwidth, float occupancy) {
  const Details read_side{false, 0, 0};
  const Details write_side{true, 0, 0};

  add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, deviceIdx, hMemModule, subDevice,
                       timestamp, rdBandwidth, get_Bandwidth_track_uuuid, "Memory Read BW", read_side);

  add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, deviceIdx, hMemModule, subDevice,
                       timestamp, wtBandwidth, get_Bandwidth_track_uuuid, "Memory Write BW", write_side);

  // The occupancy getter does not forward the details; the write-side value is
  // passed along only because the call has always supplied it.
  add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, deviceIdx, hMemModule, subDevice,
                       timestamp, occupancy, get_Occupancy_track_uuuid, "Memory Occupancy", write_side);
}


static void add_event_fabricPort( timeline_dispatch_t *dispatch, std::string hostname,
uint64_t process_id, uint64_t thread_id, uint64_t did, uint32_t deviceIdx, uintptr_t hFabricPort,
uint32_t subDevice, uint64_t timestamp, uint32_t fabricId, uint32_t remotePortId,
float rxThroughput, float txThroughput, float rxSpeed, float txSpeed) {
// Define details for RX throughput.
FabricDetails details = {false, fabricId, remotePortId};
Details details = {false, fabricId, remotePortId};
add_event_DTelemetry(dispatch, hostname, process_id, thread_id, did, deviceIdx, hFabricPort, subDevice, timestamp,
rxThroughput, get_fpThroughput_track_uuuid, "Fabric ThroughputRX", details);

Expand Down Expand Up @@ -467,6 +500,15 @@ static void fabricPort_usr_callback(void *btx_handle, void *usr_data, const char
remotePortId, rxThroughput, txThroughput, rxSpeed, txSpeed);
}

static void memModule_usr_callback(void *btx_handle, void *usr_data, const char *hostname,
int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend,
uint64_t did, uint32_t deviceIdx, uint64_t hMemModule, uint32_t subDevice,
float rdBandwidth, float wtBandwidth, float occupancy) {
auto *dispatch = static_cast<timeline_dispatch_t *>(usr_data);
add_event_memModule(dispatch, hostname, vpid, vtid, did, deviceIdx, hMemModule, subDevice, ts, rdBandwidth, wtBandwidth, occupancy);
}


void btx_register_usr_callbacks(void *btx_handle) {
btx_register_callbacks_lttng_host(btx_handle, &host_usr_callback);
btx_register_callbacks_lttng_device(btx_handle, &device_usr_callback);
Expand All @@ -475,6 +517,7 @@ void btx_register_usr_callbacks(void *btx_handle) {
btx_register_callbacks_lttng_computeEU(btx_handle, &computeEU_usr_callback);
btx_register_callbacks_lttng_copyEU(btx_handle, &copyEU_usr_callback);
btx_register_callbacks_lttng_fabricPort(btx_handle, &fabricPort_usr_callback);
btx_register_callbacks_lttng_memModule(btx_handle, &memModule_usr_callback);
btx_register_callbacks_initialize_component(btx_handle, &btx_initialize_component_callback);
btx_register_callbacks_read_params(btx_handle, &read_params_callback);
btx_register_callbacks_finalize_component(btx_handle, &btx_finalize_component_callback);
Expand Down
59 changes: 57 additions & 2 deletions ze/btx_zeinterval_callbacks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,7 @@ static void lttng_ust_ze_sampling_fabricPort_callback(void *btx_handle, void *us
auto subDevice = it0->second.subdeviceId;
auto fabricId = it0->second.portId.fabricId;
auto remotePortId = pFabricPortState_val->remotePortId.fabricId;
// Current Speed
// Current Speed (not used currently in the timeline)
double rxSpeed = static_cast<double>(pFabricPortState_val->rxSpeed.bitRate * pFabricPortState_val->rxSpeed.width)/8.0;
double txSpeed = static_cast<double>(pFabricPortState_val->txSpeed.bitRate * pFabricPortState_val->txSpeed.width)/8.0;

Expand Down Expand Up @@ -832,6 +832,47 @@ static void lttng_ust_ze_sampling_fabricPort_callback(void *btx_handle, void *us
}
}

// Sampling callback for one memory-module state/bandwidth snapshot.
//
// The first snapshot seen for a (hostname, vpid, device, module, sub-device)
// key is only stored as a reference. Every later snapshot whose bandwidth
// timestamp differs from the stored one produces an lttng:memModule message
// containing:
//   - allocation : (size - free) * 100 / size
//   - rdBandwidth: delta(readCounter)  * 1e6 / (delta(timestamp) * maxBandwidth)
//   - wtBandwidth: delta(writeCounter) * 1e6 / (delta(timestamp) * maxBandwidth)
// following the formula documented for zes_mem_bandwidth_t. The message is
// stamped with the trace timestamp of the previous snapshot, after which the
// stored reference is replaced by the current snapshot.
//
// A missing payload is ignored, and a zero module size or a zero
// maxBandwidth yields 0 instead of a NaN/inf counter value.
static void lttng_ust_ze_sampling_memStats_callback(void *btx_handle, void *usr_data, int64_t ts,
                                                    const char *hostname, int64_t vpid,
                                                    uint64_t vtid, ze_device_handle_t hDevice,
                                                    zes_mem_handle_t hMemModule,
                                                    size_t _pMemState_val_length,
                                                    zes_mem_state_t *pMemState_val,
                                                    size_t _pMemBandwidth_val_length,
                                                    zes_mem_bandwidth_t *pMemBandwidth_val) {
  auto *data = static_cast<data_t *>(usr_data);
  const auto it0 = data->memModule_property.find({hostname, vpid, hDevice, hMemModule});
  if (it0 == data->memModule_property.cend()) {
    std::cerr << "Memory property not found!" << std::endl;
    return;
  }

  // Neither metric can be derived without both payloads.
  if (pMemState_val == nullptr || pMemBandwidth_val == nullptr)
    return;

  // Get memModule properties: subdevice ID ...
  auto subDevice = it0->second.subdeviceId;

  // Store the current bandwidth snapshot if this is the first one for the key.
  auto [it, inserted] = data->device_memModule_ref.insert(
      {{hostname, vpid, hDevice, hMemModule, subDevice}, {*pMemBandwidth_val, ts}});
  if (inserted)
    return;

  // Previous bandwidth snapshot and the trace timestamp it was recorded at.
  auto &[prev_bandwidth, prev_ts] = it->second;

  // Same device-side timestamp: no interval to compute over.
  if (pMemBandwidth_val->timestamp == prev_bandwidth.timestamp)
    return;

  // https://spec.oneapi.io/level-zero/latest/sysman/api.html#_CPPv419zes_mem_bandwidth_t
  const double module_size = static_cast<double>(pMemState_val->size);
  const double allocation =
      module_size > 0.0
          ? static_cast<double>(pMemState_val->size - pMemState_val->free) * 100.0 / module_size
          : 0.0;

  const double time_diff = static_cast<double>(pMemBandwidth_val->timestamp - prev_bandwidth.timestamp);
  const double denominator = time_diff * pMemBandwidth_val->maxBandwidth;
  // Counter delta normalised by the interval and the reported maximum bandwidth;
  // 0 when the driver reports no maximum bandwidth.
  const auto utilisation = [denominator](uint64_t current, uint64_t previous) {
    return denominator > 0.0 ? static_cast<double>(current - previous) * 1e6 / denominator : 0.0;
  };
  const double rdBandwidth = utilisation(pMemBandwidth_val->readCounter, prev_bandwidth.readCounter);
  const double wtBandwidth = utilisation(pMemBandwidth_val->writeCounter, prev_bandwidth.writeCounter);

  DeviceHash uuid_idx = get_device_hash(usr_data, hostname, vpid, hDevice);
  btx_push_message_lttng_memModule(btx_handle, hostname, 0, 0, prev_ts, BACKEND_ZE,
                                   uuid_idx.hash, uuid_idx.deviceIdx, (uint64_t)hMemModule, subDevice,
                                   rdBandwidth, wtBandwidth, allocation);

  // Update the stored reference for the next interval.
  it->second = {*pMemBandwidth_val, ts};
}

static void lttng_ust_ze_sampling_engineStats_callback(void *btx_handle, void *usr_data, int64_t ts,
const char *hostname, int64_t vpid,
uint64_t vtid, ze_device_handle_t hDevice,
Expand All @@ -842,7 +883,7 @@ static void lttng_ust_ze_sampling_engineStats_callback(void *btx_handle, void *u
const auto it0 = data->engine_property.find({hostname, vpid, hDevice, hEngine});
if (it0 != data->engine_property.cend()) {
const auto& engineProps = it0->second;
uint32_t subDevice = engineProps.subdeviceId; // (engineProps.onSubdevice) ? engineProps.subdeviceId : 0;
uint32_t subDevice = engineProps.subdeviceId;

if (engineProps.type == ZES_ENGINE_GROUP_COMPUTE_ALL || engineProps.type == ZES_ENGINE_GROUP_COPY_ALL) {
auto [it, inserted] = data->device_engines_ref.insert(
Expand Down Expand Up @@ -931,6 +972,16 @@ static void lttng_ust_ze_sampling_fabricPortProperties_callback(void *btx_handle
data->fabricPort_property[{hostname, vpid, (ze_device_handle_t)hDevice, (zes_fabric_port_handle_t)hFabricPort}] = *pFabricPortProperties_val;
}

static void lttng_ust_ze_sampling_memoryProperties_callback(void *btx_handle, void *usr_data, int64_t ts,
const char *hostname, int64_t vpid, uint64_t vtid,
ze_device_handle_t hDevice, zes_mem_handle_t hMemModule,
size_t _pMemModuleProperties_val_length,
zes_mem_properties_t *pMemModuleProperties_val) {
auto *data = static_cast<data_t *>(usr_data);
data->memModule_property[{hostname, vpid, (ze_device_handle_t)hDevice, (zes_mem_handle_t)hMemModule}] = *pMemModuleProperties_val;
}


static void lttng_ust_ze_sampling_powerProperties_callback(void *btx_handle, void *usr_data, int64_t ts,
const char *hostname, int64_t vpid, uint64_t vtid,
ze_device_handle_t hDevice, zes_pwr_handle_t hPower,
Expand Down Expand Up @@ -1070,8 +1121,12 @@ void btx_register_usr_callbacks(void *btx_handle) {
btx_handle, &lttng_ust_ze_sampling_engineProperties_callback);
btx_register_callbacks_lttng_ust_ze_sampling_freqProperties(
btx_handle, &lttng_ust_ze_sampling_freqProperties_callback);
btx_register_callbacks_lttng_ust_ze_sampling_memoryProperties(
btx_handle, &lttng_ust_ze_sampling_memoryProperties_callback);

// Telemetries
btx_register_callbacks_lttng_ust_ze_sampling_memStats(
btx_handle, &lttng_ust_ze_sampling_memStats_callback);
btx_register_callbacks_lttng_ust_ze_sampling_fabricPort(
btx_handle, &lttng_ust_ze_sampling_fabricPort_callback);
btx_register_callbacks_lttng_ust_ze_sampling_gpu_energy(
Expand Down
Loading

0 comments on commit 74d8005

Please sign in to comment.