Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support HMM profiling event #96

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions include/rocm_smi/kfd_ioctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -547,9 +547,47 @@ enum kfd_smi_event {
KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
KFD_SMI_EVENT_GPU_PRE_RESET = 3,
KFD_SMI_EVENT_GPU_POST_RESET = 4,
KFD_SMI_EVENT_MIGRATE_START = 5,
KFD_SMI_EVENT_MIGRATE_END = 6,
KFD_SMI_EVENT_PAGE_FAULT_START = 7,
KFD_SMI_EVENT_PAGE_FAULT_END = 8,
KFD_SMI_EVENT_QUEUE_EVICTION = 9,
KFD_SMI_EVENT_QUEUE_RESTORE = 10,
KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,

/*
* Maximum event number, used as a flag bit to request events from all
* processes. Setting it requires superuser permission; without that
* permission no events from any process are received. When this flag is
* not set, only events from the same process are received.
*/
KFD_SMI_EVENT_ALL_PROCESS = 64
};

/*
 * Build the event-mask bit for a 1-based event index: index i selects bit
 * (i - 1), so KFD_SMI_EVENT_ALL_PROCESS (64) selects bit 63. The argument
 * must be at least 1; an index of 0 would shift by a negative amount, which
 * is undefined behavior in C.
 */
#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
/* NOTE(review): presumably the size, in bytes, of one SMI event message -- confirm against the KFD driver. */
#define KFD_SMI_EVENT_MSG_SIZE 96

/*
 * Migration trigger codes. Values are consecutive starting at 1 and are
 * spelled out explicitly so they stay fixed if entries are ever reordered.
 * NOTE(review): presumably reported with the KFD_SMI_EVENT_MIGRATE_* events
 * -- confirm against the KFD driver.
 */
enum KFD_MIGRATE_TRIGGERS {
	KFD_MIGRATE_TRIGGER_PREFETCH      = 1,
	KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU = 2,
	KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU = 3,
	KFD_MIGRATE_TRIGGER_TTM_EVICTION  = 4
};

/*
 * Queue eviction trigger codes. Values are consecutive starting at 1 and
 * are spelled out explicitly so they stay fixed if entries are reordered.
 * Note that the two CRIU entries do not carry the "_TRIGGER_" infix used by
 * the other enumerators; the names are kept as they are.
 * NOTE(review): presumably reported with KFD_SMI_EVENT_QUEUE_EVICTION --
 * confirm against the KFD driver.
 */
enum KFD_QUEUE_EVICTION_TRIGGERS {
	KFD_QUEUE_EVICTION_TRIGGER_SVM     = 1,
	KFD_QUEUE_EVICTION_TRIGGER_USERPTR = 2,
	KFD_QUEUE_EVICTION_TRIGGER_TTM     = 3,
	KFD_QUEUE_EVICTION_TRIGGER_SUSPEND = 4,
	KFD_QUEUE_EVICTION_CRIU_CHECKPOINT = 5,
	KFD_QUEUE_EVICTION_CRIU_RESTORE    = 6
};

/*
 * SVM unmap trigger codes. Values are consecutive starting at 1 and are
 * spelled out explicitly so they stay fixed if entries are ever reordered.
 * NOTE(review): presumably reported with KFD_SMI_EVENT_UNMAP_FROM_GPU --
 * confirm against the KFD driver.
 */
enum KFD_SVM_UNMAP_TRIGGERS {
	KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY         = 1,
	KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE = 2,
	KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU     = 3
};

struct kfd_ioctl_smi_events_args {
__u32 gpuid; /* to KFD */
Expand Down
43 changes: 36 additions & 7 deletions include/rocm_smi/rocm_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,13 +304,20 @@ typedef struct {
* Event notification event types
*/
typedef enum {
RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT, //!< VM page fault
RSMI_EVT_NOTIF_VMFAULT = 1, //!< VM page fault
RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT,
RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE,
RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET,
RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET,

RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_GPU_POST_RESET
RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2,
RSMI_EVT_NOTIF_GPU_PRE_RESET = 3,
RSMI_EVT_NOTIF_GPU_POST_RESET = 4,
RSMI_EVT_NOTIF_MIGRATE_START = 5,
RSMI_EVT_NOTIF_MIGRATE_END = 6,
RSMI_EVT_NOTIF_PAGE_FAULT_START = 7,
RSMI_EVT_NOTIF_PAGE_FAULT_END = 8,
RSMI_EVT_NOTIF_QUEUE_EVICTION = 9,
RSMI_EVT_NOTIF_QUEUE_RESTORE = 10,
RSMI_EVT_NOTIF_UNMAP_FROM_GPU = 11,
RSMI_EVT_NOTIF_ALL_PROCESS = 64,
RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_UNMAP_FROM_GPU
} rsmi_evt_notification_type_t;

/**
Expand All @@ -319,7 +326,29 @@ typedef enum {
#define RSMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))

//! Maximum number of characters an event notification message will be
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 64
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96

/**
 * @brief Migration trigger codes.
 *
 * The values are consecutive starting at 1 and numerically match the
 * corresponding KFD_MIGRATE_TRIGGER_* constants; they are written out
 * explicitly so the numbering cannot shift if entries are reordered.
 */
typedef enum {
  RSMI_EVT_NOTIF_MIGRATE_TRIGGER_PREFETCH      = 1,
  RSMI_EVT_NOTIF_MIGRATE_TRIGGER_PAGEFAULT_GPU = 2,
  RSMI_EVT_NOTIF_MIGRATE_TRIGGER_PAGEFAULT_CPU = 3,
  RSMI_EVT_NOTIF_MIGRATE_TRIGGER_TTM_EVICTION  = 4
} rsmi_evt_notification_migrate_trigger_type_t;
bill-shuzhou-liu marked this conversation as resolved.
Show resolved Hide resolved

/**
 * @brief Queue eviction trigger codes.
 *
 * The values are consecutive starting at 1 and numerically match the
 * corresponding KFD_QUEUE_EVICTION_* constants; they are written out
 * explicitly so the numbering cannot shift if entries are reordered.
 * The two CRIU entries do not carry the "_TRIGGER_" infix used by the
 * other enumerators; the names are kept as they are.
 */
typedef enum {
  RSMI_EVT_NOTIF_QUEUE_EVICTION_TRIGGER_SVM     = 1,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_TRIGGER_USERPTR = 2,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_TRIGGER_TTM     = 3,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_TRIGGER_SUSPEND = 4,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_CRIU_CHECKPOINT = 5,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_CRIU_RESTORE    = 6
} rsmi_evt_notification_queue_eviction_trigger_type_t;

/**
 * @brief SVM unmap trigger codes.
 *
 * The values are consecutive starting at 1 and numerically match the
 * corresponding KFD_SVM_UNMAP_TRIGGER_* constants; they are written out
 * explicitly so the numbering cannot shift if entries are reordered.
 */
typedef enum {
  RSMI_EVT_NOTIF_SVM_UNMAP_TRIGGER_MMU_NOTIFY         = 1,
  RSMI_EVT_NOTIF_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE = 2,
  RSMI_EVT_NOTIF_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU     = 3
} rsmi_evt_notification_svm_unmap_trigger_type_t;

/**
* Event notification data returned from event notification API
Expand Down
5 changes: 4 additions & 1 deletion src/rocm_smi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4013,7 +4013,10 @@ rsmi_event_notification_get(int timeout_ms,
reinterpret_cast<rsmi_evt_notification_data_t *>(&data[*num_elem]);

uint32_t event;
while (fscanf(anon_fp, "%x %63s\n", &event,
#define __LEN__(X) #X
#define LEN(X) __LEN__(X)

while (fscanf(anon_fp, "%x %" LEN(MAX_EVENT_NOTIFICATION_MSG_SIZE)"[^\n]\n", &event,
bill-shuzhou-liu marked this conversation as resolved.
Show resolved Hide resolved
reinterpret_cast<char *>(&data_item->message)) == 2) {
/* Output is in format as "event information\n"
* Both event are expressed in hex.
Expand Down