Skip to content

Commit

Permalink
support per process utilization statistics and accounting support
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangruipeng committed Jul 25, 2019
1 parent 88719e5 commit c219612
Show file tree
Hide file tree
Showing 3 changed files with 282 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ go:
- "1.8"
- "1.9"
- "1.10"
- "1.11"
- "1.12"

script:
- make presubmit
226 changes: 226 additions & 0 deletions bindings.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,65 @@ nvmlReturn_t nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int*
return nvmlDeviceGetDecoderUtilizationFunc(device, utilization, samplingPeriodUs);
}
// Address of the library's nvmlSystemGetProcessName entry point. It is NULL
// until nvmlInit_dl assigns it from dlsym.
nvmlReturn_t (*nvmlSystemGetProcessNameFunc)(unsigned int pid, char *name, unsigned int length);

// Forwards to the dynamically resolved nvmlSystemGetProcessName, or reports
// NVML_ERROR_FUNCTION_NOT_FOUND when the symbol has not been resolved.
nvmlReturn_t nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length) {
  if (nvmlSystemGetProcessNameFunc != NULL) {
    return nvmlSystemGetProcessNameFunc(pid, name, length);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}
// Address of the library's nvmlDeviceGetAccountingMode entry point. It is NULL
// until nvmlInit_dl assigns it from dlsym.
nvmlReturn_t (*nvmlDeviceGetAccountingModeFunc)(nvmlDevice_t device, nvmlEnableState_t *mode);

// Forwards to the dynamically resolved nvmlDeviceGetAccountingMode, or reports
// NVML_ERROR_FUNCTION_NOT_FOUND when the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode) {
  if (nvmlDeviceGetAccountingModeFunc != NULL) {
    return nvmlDeviceGetAccountingModeFunc(device, mode);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}
// Address of the library's nvmlDeviceSetAccountingMode entry point. It is NULL
// until nvmlInit_dl assigns it from dlsym.
nvmlReturn_t (*nvmlDeviceSetAccountingModeFunc)(nvmlDevice_t device, nvmlEnableState_t mode);

// Forwards to the dynamically resolved nvmlDeviceSetAccountingMode, or reports
// NVML_ERROR_FUNCTION_NOT_FOUND when the symbol has not been resolved.
nvmlReturn_t nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode) {
  if (nvmlDeviceSetAccountingModeFunc != NULL) {
    return nvmlDeviceSetAccountingModeFunc(device, mode);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}
// Address of the library's nvmlDeviceGetAccountingStats entry point. It is
// NULL until nvmlInit_dl assigns it from dlsym.
nvmlReturn_t (*nvmlDeviceGetAccountingStatsFunc)(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats);

// Forwards to the dynamically resolved nvmlDeviceGetAccountingStats, or
// reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats) {
  if (nvmlDeviceGetAccountingStatsFunc != NULL) {
    return nvmlDeviceGetAccountingStatsFunc(device, pid, stats);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}
// Address of the library's nvmlDeviceGetAccountingPids entry point. It is NULL
// until nvmlInit_dl assigns it from dlsym.
nvmlReturn_t (*nvmlDeviceGetAccountingPidsFunc)(nvmlDevice_t device, unsigned int *count, unsigned int *pids);

// Forwards to the dynamically resolved nvmlDeviceGetAccountingPids, or reports
// NVML_ERROR_FUNCTION_NOT_FOUND when the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids) {
  if (nvmlDeviceGetAccountingPidsFunc != NULL) {
    return nvmlDeviceGetAccountingPidsFunc(device, count, pids);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}
// Address of the library's nvmlDeviceGetAccountingBufferSize entry point. It
// is NULL until nvmlInit_dl assigns it from dlsym.
nvmlReturn_t (*nvmlDeviceGetAccountingBufferSizeFunc)(nvmlDevice_t device, unsigned int* bufferSize);

// Forwards to the dynamically resolved nvmlDeviceGetAccountingBufferSize, or
// reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int* bufferSize) {
  if (nvmlDeviceGetAccountingBufferSizeFunc != NULL) {
    return nvmlDeviceGetAccountingBufferSizeFunc(device, bufferSize);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}
// Address of the library's nvmlDeviceGetProcessUtilization entry point. It is
// NULL until nvmlInit_dl assigns it from dlsym.
nvmlReturn_t (*nvmlDeviceGetProcessUtilizationFunc)(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization,
    unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp);

// Forwards to the dynamically resolved nvmlDeviceGetProcessUtilization, or
// reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization,
    unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp) {
  if (nvmlDeviceGetProcessUtilizationFunc != NULL) {
    return nvmlDeviceGetProcessUtilizationFunc(device, utilization, processSamplesCount, lastSeenTimeStamp);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}
nvmlReturn_t (*nvmlDeviceGetSamplesFunc)(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);
// Loads the "libnvidia-ml.so.1" shared library.
Expand Down Expand Up @@ -221,6 +280,35 @@ nvmlReturn_t nvmlInit_dl(void) {
if (nvmlDeviceGetDecoderUtilizationFunc == NULL) {
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
nvmlSystemGetProcessNameFunc = dlsym(nvmlHandle, "nvmlSystemGetProcessName");
if (nvmlSystemGetProcessNameFunc == NULL) {
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
nvmlDeviceGetAccountingModeFunc = dlsym(nvmlHandle, "nvmlDeviceGetAccountingMode");
if (nvmlDeviceGetAccountingModeFunc == NULL) {
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
nvmlDeviceSetAccountingModeFunc = dlsym(nvmlHandle, "nvmlDeviceSetAccountingMode");
if (nvmlDeviceSetAccountingModeFunc == NULL) {
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
nvmlDeviceGetAccountingStatsFunc = dlsym(nvmlHandle, "nvmlDeviceGetAccountingStats");
if (nvmlDeviceGetAccountingStatsFunc == NULL) {
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
nvmlDeviceGetAccountingPidsFunc = dlsym(nvmlHandle, "nvmlDeviceGetAccountingPids");
if (nvmlDeviceGetAccountingPidsFunc == NULL) {
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
nvmlDeviceGetAccountingBufferSizeFunc = dlsym(nvmlHandle, "nvmlDeviceGetAccountingBufferSize");
if (nvmlDeviceGetAccountingBufferSizeFunc == NULL) {
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
nvmlDeviceGetProcessUtilizationFunc = dlsym(nvmlHandle, "nvmlDeviceGetProcessUtilization");
if (nvmlDeviceGetProcessUtilizationFunc == NULL) {
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
nvmlReturn_t result = nvmlInitFunc();
if (result != NVML_SUCCESS) {
dlclose(nvmlHandle);
Expand Down Expand Up @@ -376,6 +464,16 @@ type Device struct {
dev C.nvmlDevice_t
}

// Utilization is one per-process utilization sample: a process ID together
// with the SM, memory, encoder and decoder utilization values recorded for it.
type Utilization struct {
Pid uint // PID of the process the sample belongs to.
// NOTE(review): timeStamp is the only unexported field, so code outside
// this package cannot read it; confirm whether it should be exported.
timeStamp uint64 // CPU timestamp in microseconds.
SMUtil uint // SM (3D/Compute) utilization value.
MemUtil uint // Frame buffer memory utilization value.
EncUtil uint // Encoder utilization value.
DecUtil uint // Decoder utilization value.
}

// DeviceHandleByIndex returns the device handle for a particular index.
// The indices range from 0 to DeviceCount()-1. The order in which NVML
// enumerates devices has no guarantees of consistency between reboots.
Expand Down Expand Up @@ -522,3 +620,131 @@ func (d Device) DecoderUtilization() (uint, uint, error) {
r := C.nvmlDeviceGetDecoderUtilization(d.dev, &n, &sp)
return uint(n), uint(sp), errorString(r)
}

// AccountingMode queries the device's current accounting mode through
// nvmlDeviceGetAccountingMode.
//
// It returns errLibraryNotLoaded, together with the zero-value mode, when the
// NVML library handle is nil. Otherwise it returns the mode written by NVML
// and the NVML status converted by errorString.
func (d Device) AccountingMode() (C.nvmlEnableState_t, error) {
	var mode C.nvmlEnableState_t
	if C.nvmlHandle != nil {
		ret := C.nvmlDeviceGetAccountingMode(d.dev, &mode)
		return mode, errorString(ret)
	}
	return mode, errLibraryNotLoaded
}

// DeviceSetAccountingMode sets the device's accounting mode through
// nvmlDeviceSetAccountingMode.
//
// enable selects NVML_FEATURE_ENABLED when true and NVML_FEATURE_DISABLED when
// false. It returns errLibraryNotLoaded when the NVML library handle is nil,
// otherwise the NVML status converted by errorString.
func (d Device) DeviceSetAccountingMode(enable bool) error {
	if C.nvmlHandle == nil {
		return errLibraryNotLoaded
	}

	var state C.nvmlEnableState_t
	if enable {
		state = C.NVML_FEATURE_ENABLED
	} else {
		state = C.NVML_FEATURE_DISABLED
	}

	return errorString(C.nvmlDeviceSetAccountingMode(d.dev, state))
}

// AccountingStats queries the accounting statistics of one process on the
// device through nvmlDeviceGetAccountingStats.
//
// pid is the ID of the process to query. It returns errLibraryNotLoaded,
// together with zero-value stats, when the NVML library handle is nil.
// Otherwise it returns the stats written by NVML and the NVML status converted
// by errorString.
func (d Device) AccountingStats(pid uint) (C.nvmlAccountingStats_t, error) {
	var result C.nvmlAccountingStats_t
	if C.nvmlHandle != nil {
		ret := C.nvmlDeviceGetAccountingStats(d.dev, C.uint(pid), &result)
		return result, errorString(ret)
	}
	return result, errLibraryNotLoaded
}

// AccountingPids queries the list of process IDs that can be queried for
// accounting stats, through nvmlDeviceGetAccountingPids.
//
// count is the capacity of the PID buffer handed to NVML. When count is zero
// no buffer is allocated: NVML is called with a nil buffer and only the count
// it reports is returned, alongside a nil slice.
//
// The returned slice always has length count (not the reported count), so
// callers should reslice it with the returned count after checking the error.
// The second result is the count written back by NVML. The error is
// errLibraryNotLoaded when the NVML library handle is nil, otherwise the NVML
// status converted by errorString.
func (d Device) AccountingPids(count uint) ([]C.uint, uint, error) {
	if C.nvmlHandle == nil {
		return nil, 0, errLibraryNotLoaded
	}

	cCount := C.uint(count)
	if count == 0 {
		r := C.nvmlDeviceGetAccountingPids(d.dev, &cCount, nil)
		return nil, uint(cCount), errorString(r)
	}

	// make already zero-initializes the buffer; no explicit fill is needed.
	pids := make([]C.uint, count)
	r := C.nvmlDeviceGetAccountingPids(d.dev, &cCount, &pids[0])
	return pids, uint(cCount), errorString(r)
}

// AccountingBufferSize returns the number of processes that the circular
// buffer with accounting PIDs can hold, as reported by
// nvmlDeviceGetAccountingBufferSize.
//
// It returns errLibraryNotLoaded when the NVML library handle is nil,
// otherwise the reported size and the NVML status converted by errorString.
func (d Device) AccountingBufferSize() (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}

	var capacity C.uint
	ret := C.nvmlDeviceGetAccountingBufferSize(d.dev, &capacity)
	err := errorString(ret)
	return uint(capacity), err
}

// ProcessUtilization retrieves per-process utilization samples for the device
// through nvmlDeviceGetProcessUtilization.
//
// processCount is the maximum number of samples to retrieve; it sizes the
// buffer handed to NVML and must be greater than zero. since selects the look
// back window: the timestamp passed to NVML is time.Now() minus since,
// expressed in microseconds.
//
// Samples whose pid is zero are dropped. The result is a non-nil slice holding
// the remaining samples. The error is errLibraryNotLoaded when the NVML
// library handle is nil, otherwise any non-nil error produced by errorString
// for the NVML status.
func (d Device) ProcessUtilization(processCount uint, since time.Duration) ([]*Utilization, error) {
	if C.nvmlHandle == nil {
		return nil, errLibraryNotLoaded
	}
	if processCount == 0 {
		return nil, errors.New("process count must be greater than zero")
	}

	samples := make([]C.nvmlProcessUtilizationSample_t, processCount)
	// NVML expects the number of entries the buffer can hold, not its size in
	// bytes; overstating it would let the library write past the buffer.
	sampleCount := C.uint(processCount)

	lastSeen := C.ulonglong(time.Now().Add(-since).UnixNano() / 1000)
	r := C.nvmlDeviceGetProcessUtilization(d.dev, &samples[0], &sampleCount, lastSeen)
	if err := errorString(r); err != nil {
		return nil, err
	}

	// Never trust the reported count beyond what was actually allocated.
	n := uint(sampleCount)
	if n > uint(len(samples)) {
		n = uint(len(samples))
	}

	utilizations := make([]*Utilization, 0, n)
	for i := range samples[:n] {
		s := &samples[i]
		if s.pid == 0 {
			continue
		}
		utilizations = append(utilizations, &Utilization{
			Pid:       uint(s.pid),
			timeStamp: uint64(s.timeStamp),
			SMUtil:    uint(s.smUtil),
			MemUtil:   uint(s.memUtil),
			EncUtil:   uint(s.encUtil),
			DecUtil:   uint(s.decUtil),
		})
	}

	return utilizations, nil
}

// SystemGetProcessName returns the name of the process with the given pid, as
// reported by nvmlSystemGetProcessName.
//
// buffersize is the length of the name buffer offered to NVML and must be
// greater than zero; a zero value is rejected with an error instead of
// panicking on an empty buffer. The error is errLibraryNotLoaded when the NVML
// library handle is nil, otherwise the NVML status converted by errorString.
// The string read from the buffer is returned even when the status is an
// error.
func SystemGetProcessName(pid, buffersize uint) (string, error) {
	if C.nvmlHandle == nil {
		return "", errLibraryNotLoaded
	}
	if buffersize == 0 {
		return "", errors.New("buffer size must be greater than zero")
	}

	// One extra zeroed element is kept past the length advertised to NVML so
	// that C.GoString always finds a terminator, even if the library fills the
	// whole buffer without NUL-terminating a cropped name.
	buf := make([]C.char, buffersize+1)
	r := C.nvmlSystemGetProcessName(C.uint(pid), &buf[0], C.uint(buffersize))
	return C.GoString(&buf[0]), errorString(r)
}
54 changes: 54 additions & 0 deletions cmd/example/example.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,60 @@ func main() {
return
}
fmt.Printf("\tutilization.decoder: %d\n", decoderUtilization)

modeStats, err := dev.AccountingMode()
if err != nil {
fmt.Printf("\tdev.DeviceGetAccountingMode() error: %v\n", err)
return
}
fmt.Printf("\taccounting.mode enable: %v\n", modeStats)

bufferSize, err := dev.AccountingBufferSize()
if err != nil {
fmt.Printf("\tdev.DeviceGetAccountingBufferSize() error: %v\n", err)
return
}
fmt.Printf("\taccounting.buffersize: %d\n", bufferSize)

pids, count, err := dev.AccountingPids(bufferSize)
if err != nil {
fmt.Printf("\tdev.DeviceGetAccountingPids() error: %v\n", err)
} else {
fmt.Printf("\taccounting.pids.count: %v\n", count)
for _, pid := range pids[:count] {
fmt.Printf("\t\tPid: %v", pid)
stats, err := dev.AccountingStats(uint(pid))
if err != nil {
fmt.Printf("\tdev.DeviceGetAccountingStats() error: %v\n", err)
} else {
fmt.Printf(", stats: %s\n", stats)
}
}
}

utilizations, err := dev.ProcessUtilization(10, 10*time.Second)
if err != nil {
fmt.Printf("\tdev.DeviceGetProcessUtilization() error: %v\n", err)
} else {
fmt.Printf("\tProcess count: %v\n", len(utilizations))

utilizations = utilizations
for _, sample := range utilizations {
fmt.Printf("\t\tProcess: %v", sample.Pid)
fmt.Printf(", SM util: %v", sample.SMUtil)
fmt.Printf(", Mem util: %v", sample.MemUtil)
fmt.Printf(", Enc util: %v", sample.EncUtil)
fmt.Printf(", Dec util: %v", sample.DecUtil)

name, err := gonvml.SystemGetProcessName(sample.Pid, 64)
if err != nil {
fmt.Printf("\n\tdev.SystemGetProcessName() error: %v\n", err)
} else {
fmt.Printf(", Name: %s\n", name)
}
}
}

fmt.Println()
}
}

0 comments on commit c219612

Please sign in to comment.