From c21961252220950995f86a85e46b884dbed02f9a Mon Sep 17 00:00:00 2001
From: zhangruipeng
Date: Wed, 24 Jul 2019 21:37:48 +0800
Subject: [PATCH] support per process utilization statistics and accounting support

---
 .travis.yml            |   2 +
 bindings.go            | 246 +++++++++++++++++++++++++++++++++++++++++
 cmd/example/example.go |  54 +++++++++
 3 files changed, 302 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index d45dc3c..00df499 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,8 @@ go:
   - "1.8"
   - "1.9"
   - "1.10"
+  - "1.11"
+  - "1.12"
 
 script:
   - make presubmit
diff --git a/bindings.go b/bindings.go
index abefe83..3ca112b 100644
--- a/bindings.go
+++ b/bindings.go
@@ -143,6 +143,66 @@ nvmlReturn_t nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int*
   return nvmlDeviceGetDecoderUtilizationFunc(device, utilization, samplingPeriodUs);
 }
 
+// Per-process accounting and utilization entry points. Each wrapper returns
+// NVML_ERROR_FUNCTION_NOT_FOUND while its function pointer is still NULL.
+nvmlReturn_t (*nvmlSystemGetProcessNameFunc)(unsigned int pid, char *name, unsigned int length);
+nvmlReturn_t nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length) {
+  if (nvmlSystemGetProcessNameFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  return nvmlSystemGetProcessNameFunc(pid, name, length);
+}
+
+nvmlReturn_t (*nvmlDeviceGetAccountingModeFunc)(nvmlDevice_t device, nvmlEnableState_t *mode);
+nvmlReturn_t nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode) {
+  if (nvmlDeviceGetAccountingModeFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  return nvmlDeviceGetAccountingModeFunc(device, mode);
+}
+
+nvmlReturn_t (*nvmlDeviceSetAccountingModeFunc)(nvmlDevice_t device, nvmlEnableState_t mode);
+nvmlReturn_t nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode) {
+  if (nvmlDeviceSetAccountingModeFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  return nvmlDeviceSetAccountingModeFunc(device, mode);
+}
+
+nvmlReturn_t (*nvmlDeviceGetAccountingStatsFunc)(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats);
+nvmlReturn_t nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats) {
+  if (nvmlDeviceGetAccountingStatsFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  return nvmlDeviceGetAccountingStatsFunc(device, pid, stats);
+}
+
+nvmlReturn_t (*nvmlDeviceGetAccountingPidsFunc)(nvmlDevice_t device, unsigned int *count, unsigned int *pids);
+nvmlReturn_t nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids) {
+  if (nvmlDeviceGetAccountingPidsFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  return nvmlDeviceGetAccountingPidsFunc(device, count, pids);
+}
+
+nvmlReturn_t (*nvmlDeviceGetAccountingBufferSizeFunc)(nvmlDevice_t device, unsigned int* bufferSize);
+nvmlReturn_t nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int* bufferSize) {
+  if (nvmlDeviceGetAccountingBufferSizeFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  return nvmlDeviceGetAccountingBufferSizeFunc(device, bufferSize);
+}
+
+nvmlReturn_t (*nvmlDeviceGetProcessUtilizationFunc)(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization,
+    unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp);
+nvmlReturn_t nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization,
+    unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp) {
+  if (nvmlDeviceGetProcessUtilizationFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  return nvmlDeviceGetProcessUtilizationFunc(device, utilization, processSamplesCount, lastSeenTimeStamp);
+}
+
 nvmlReturn_t (*nvmlDeviceGetSamplesFunc)(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);
 
 // Loads the "libnvidia-ml.so.1" shared library.
@@ -221,6 +281,37 @@ nvmlReturn_t nvmlInit_dl(void) {
   if (nvmlDeviceGetDecoderUtilizationFunc == NULL) {
     return NVML_ERROR_FUNCTION_NOT_FOUND;
   }
+  // The accounting and per-process utilization symbols below are mandatory:
+  // NVML_ERROR_FUNCTION_NOT_FOUND is returned if any one of them is missing.
+  nvmlSystemGetProcessNameFunc = dlsym(nvmlHandle, "nvmlSystemGetProcessName");
+  if (nvmlSystemGetProcessNameFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  nvmlDeviceGetAccountingModeFunc = dlsym(nvmlHandle, "nvmlDeviceGetAccountingMode");
+  if (nvmlDeviceGetAccountingModeFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  nvmlDeviceSetAccountingModeFunc = dlsym(nvmlHandle, "nvmlDeviceSetAccountingMode");
+  if (nvmlDeviceSetAccountingModeFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  nvmlDeviceGetAccountingStatsFunc = dlsym(nvmlHandle, "nvmlDeviceGetAccountingStats");
+  if (nvmlDeviceGetAccountingStatsFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  nvmlDeviceGetAccountingPidsFunc = dlsym(nvmlHandle, "nvmlDeviceGetAccountingPids");
+  if (nvmlDeviceGetAccountingPidsFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  nvmlDeviceGetAccountingBufferSizeFunc = dlsym(nvmlHandle, "nvmlDeviceGetAccountingBufferSize");
+  if (nvmlDeviceGetAccountingBufferSizeFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+  nvmlDeviceGetProcessUtilizationFunc = dlsym(nvmlHandle, "nvmlDeviceGetProcessUtilization");
+  if (nvmlDeviceGetProcessUtilizationFunc == NULL) {
+    return NVML_ERROR_FUNCTION_NOT_FOUND;
+  }
+
   nvmlReturn_t result = nvmlInitFunc();
   if (result != NVML_SUCCESS) {
     dlclose(nvmlHandle);
@@ -376,6 +467,16 @@ type Device struct {
 	dev C.nvmlDevice_t
 }
 
+// Utilization holds one per-process utilization sample of a device.
+type Utilization struct {
+	Pid       uint   // PID of the process.
+	timeStamp uint64 // CPU timestamp in microseconds.
+	SMUtil    uint   // SM (3D/compute) utilization value.
+	MemUtil   uint   // Frame buffer memory utilization value.
+	EncUtil   uint   // Encoder utilization value.
+	DecUtil   uint   // Decoder utilization value.
+}
+
 // DeviceHandleByIndex returns the device handle for a particular index.
 // The indices range from 0 to DeviceCount()-1. The order in which NVML
 // enumerates devices has no guarantees of consistency between reboots.
@@ -522,3 +623,148 @@ func (d Device) DecoderUtilization() (uint, uint, error) {
 	r := C.nvmlDeviceGetDecoderUtilization(d.dev, &n, &sp)
 	return uint(n), uint(sp), errorString(r)
 }
+
+// AccountingMode reports the current per-process accounting mode of the
+// device.
+func (d Device) AccountingMode() (C.nvmlEnableState_t, error) {
+	var mode C.nvmlEnableState_t
+	if C.nvmlHandle == nil {
+		return mode, errLibraryNotLoaded
+	}
+	r := C.nvmlDeviceGetAccountingMode(d.dev, &mode)
+	return mode, errorString(r)
+}
+
+// DeviceSetAccountingMode enables or disables per-process accounting on the
+// device.
+func (d Device) DeviceSetAccountingMode(enable bool) error {
+	if C.nvmlHandle == nil {
+		return errLibraryNotLoaded
+	}
+	var mode C.nvmlEnableState_t = C.NVML_FEATURE_DISABLED
+	if enable {
+		mode = C.NVML_FEATURE_ENABLED
+	}
+	r := C.nvmlDeviceSetAccountingMode(d.dev, mode)
+	return errorString(r)
+}
+
+// AccountingStats returns the accounting statistics NVML reports for the
+// process with the given pid on this device.
+func (d Device) AccountingStats(pid uint) (C.nvmlAccountingStats_t, error) {
+	var stats C.nvmlAccountingStats_t
+	if C.nvmlHandle == nil {
+		return stats, errLibraryNotLoaded
+	}
+	r := C.nvmlDeviceGetAccountingStats(d.dev, C.uint(pid), &stats)
+	return stats, errorString(r)
+}
+
+// AccountingPids lists the processes that can be queried for accounting stats.
+// count is the capacity of the pid buffer handed to NVML; when it is zero no
+// buffer is passed and only the count reported by NVML is returned. On success
+// the slice holds exactly the reported pids and the second result is their
+// number. When the NVML call fails the slice is nil and the second result is
+// the count as NVML left it.
+func (d Device) AccountingPids(count uint) ([]C.uint, uint, error) {
+	if C.nvmlHandle == nil {
+		return nil, 0, errLibraryNotLoaded
+	}
+	cCount := C.uint(count)
+	if count == 0 {
+		r := C.nvmlDeviceGetAccountingPids(d.dev, &cCount, nil)
+		return nil, uint(cCount), errorString(r)
+	}
+
+	// make zeroes the buffer, so it needs no explicit clearing.
+	pids := make([]C.uint, count)
+	r := C.nvmlDeviceGetAccountingPids(d.dev, &cCount, &pids[0])
+	if err := errorString(r); err != nil {
+		return nil, uint(cCount), err
+	}
+
+	// Never expose more entries than were allocated, whatever NVML reports.
+	n := uint(cCount)
+	if n > count {
+		n = count
+	}
+	return pids[:n], n, nil
+}
+
+// AccountingBufferSize returns the number of processes that the circular
+// buffer with accounting pids can hold.
+func (d Device) AccountingBufferSize() (uint, error) {
+	if C.nvmlHandle == nil {
+		return 0, errLibraryNotLoaded
+	}
+	var bufferSize C.uint
+	r := C.nvmlDeviceGetAccountingBufferSize(d.dev, &bufferSize)
+	return uint(bufferSize), errorString(r)
+}
+
+// ProcessUtilization returns per-process utilization samples for the device.
+// processCount is the maximum number of samples to fetch and must be greater
+// than zero. since is turned into the microsecond timestamp time.Now()-since
+// that is handed to NVML as lastSeenTimeStamp. Samples whose pid is zero are
+// left out of the result.
+func (d Device) ProcessUtilization(processCount uint, since time.Duration) ([]*Utilization, error) {
+	if C.nvmlHandle == nil {
+		return nil, errLibraryNotLoaded
+	}
+	if processCount == 0 {
+		return nil, errors.New("process count must be greater than zero")
+	}
+
+	samples := make([]C.nvmlProcessUtilizationSample_t, processCount)
+	// NVML takes the capacity of the buffer in samples, not in bytes. Passing a
+	// byte size could let the library write past the end of the slice.
+	sampleCount := C.uint(processCount)
+
+	lastTS := C.ulonglong(time.Now().Add(-1*since).UnixNano() / 1000)
+	r := C.nvmlDeviceGetProcessUtilization(d.dev, &samples[0], &sampleCount, lastTS)
+	if err := errorString(r); err != nil {
+		return nil, err
+	}
+
+	// Keep the slice expression below in bounds whatever count comes back.
+	n := uint(sampleCount)
+	if n > processCount {
+		n = processCount
+	}
+
+	utilizations := make([]*Utilization, 0, n)
+	for i := range samples[:n] {
+		sample := &samples[i]
+		if sample.pid == 0 {
+			continue
+		}
+		utilizations = append(utilizations, &Utilization{
+			Pid:       uint(sample.pid),
+			timeStamp: uint64(sample.timeStamp),
+			SMUtil:    uint(sample.smUtil),
+			MemUtil:   uint(sample.memUtil),
+			EncUtil:   uint(sample.encUtil),
+			DecUtil:   uint(sample.decUtil),
+		})
+	}
+	return utilizations, nil
+}
+
+// SystemGetProcessName returns the name of the process with the given pid.
+// buffersize is the length of the buffer handed to NVML to receive the name
+// and must be greater than zero.
+func SystemGetProcessName(pid, buffersize uint) (string, error) {
+	if C.nvmlHandle == nil {
+		return "", errLibraryNotLoaded
+	}
+	if buffersize == 0 {
+		// Taking &buf[0] of an empty slice would panic.
+		return "", errors.New("buffer size must be greater than zero")
+	}
+	buf := make([]C.char, buffersize)
+	r := C.nvmlSystemGetProcessName(C.uint(pid), &buf[0], C.uint(buffersize))
+	if err := errorString(r); err != nil {
+		return "", err
+	}
+	return C.GoString(&buf[0]), nil
+}
diff --git a/cmd/example/example.go b/cmd/example/example.go
index dbc061e..0c7adec 100644
--- a/cmd/example/example.go
+++ b/cmd/example/example.go
@@ -137,6 +137,60 @@ func main() {
 			return
 		}
 		fmt.Printf("\tutilization.decoder: %d\n", decoderUtilization)
+
+		mode, err := dev.AccountingMode()
+		if err != nil {
+			fmt.Printf("\tdev.AccountingMode() error: %v\n", err)
+			return
+		}
+		fmt.Printf("\taccounting.mode enable: %v\n", mode)
+
+		bufferSize, err := dev.AccountingBufferSize()
+		if err != nil {
+			fmt.Printf("\tdev.AccountingBufferSize() error: %v\n", err)
+			return
+		}
+		fmt.Printf("\taccounting.buffersize: %d\n", bufferSize)
+
+		pids, count, err := dev.AccountingPids(bufferSize)
+		if err != nil {
+			fmt.Printf("\tdev.AccountingPids() error: %v\n", err)
+		} else {
+			fmt.Printf("\taccounting.pids.count: %v\n", count)
+			for _, pid := range pids {
+				fmt.Printf("\t\tPid: %v", pid)
+				stats, err := dev.AccountingStats(uint(pid))
+				if err != nil {
+					fmt.Printf("\tdev.AccountingStats() error: %v\n", err)
+				} else {
+					// stats is a struct, which %s cannot format; use %+v.
+					fmt.Printf(", stats: %+v\n", stats)
+				}
+			}
+		}
+
+		utilizations, err := dev.ProcessUtilization(10, 10*time.Second)
+		if err != nil {
+			fmt.Printf("\tdev.ProcessUtilization() error: %v\n", err)
+		} else {
+			fmt.Printf("\tProcess count: %v\n", len(utilizations))
+
+			for _, sample := range utilizations {
+				fmt.Printf("\t\tProcess: %v", sample.Pid)
+				fmt.Printf(", SM util: %v", sample.SMUtil)
+				fmt.Printf(", Mem util: %v", sample.MemUtil)
+				fmt.Printf(", Enc util: %v", sample.EncUtil)
+				fmt.Printf(", Dec util: %v", sample.DecUtil)
+
+				name, err := gonvml.SystemGetProcessName(sample.Pid, 64)
+				if err != nil {
+					fmt.Printf("\n\tgonvml.SystemGetProcessName() error: %v\n", err)
+				} else {
+					fmt.Printf(", Name: %s\n", name)
+				}
+			}
+		}
+
 		fmt.Println()
 	}
 }