Skip to content

Commit

Permalink
fix(api/libnvml): fix upstream changes for process info v3 APIs on 535.104.05 driver (#94)
Browse files Browse the repository at this point in the history
  • Loading branch information
XuehaiPan committed Aug 26, 2023
1 parent 6a9663b commit 9ff3ec3
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 76 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ repos:
- id: debug-statements
- id: double-quote-string-fixer
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.284
rev: v0.0.286
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- Fix upstream changes for process info v3 APIs on 535.104.05 driver by [@XuehaiPan](https://github.com/XuehaiPan) in [#94](https://github.com/XuehaiPan/nvitop/pull/94).
- Fix removal for process info v3 APIs on the upstream 535.98 driver by [@XuehaiPan](https://github.com/XuehaiPan) in [#89](https://github.com/XuehaiPan/nvitop/pull/89).

### Removed
Expand Down
1 change: 0 additions & 1 deletion nvitop/api/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -2111,7 +2111,6 @@ def processes(self) -> dict[int, GpuProcess]:
gpu_memory=gpu_memory,
gpu_instance_id=getattr(p, 'gpuInstanceId', UINT_MAX),
compute_instance_id=getattr(p, 'computeInstanceId', UINT_MAX),
gpu_cc_protected_memory=getattr(p, 'usedGpuCcProtectedMemory', NA),
)
proc.type = proc.type + type

Expand Down
70 changes: 25 additions & 45 deletions nvitop/api/libnvml.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,66 +595,48 @@ def __determine_get_running_processes_version_suffix() -> str:
_nvmlGetFunctionPointer = _pynvml._nvmlGetFunctionPointer
__get_running_processes_version_suffix = '_v3'

def lookup(symbol: str) -> _Any:
def lookup(symbol: str) -> _Any | None:
try:
ptr = _nvmlGetFunctionPointer(symbol)
except NVMLError_FunctionNotFound:
LOGGER.debug('Failed to found symbol `%s`.', symbol)
raise
return None
LOGGER.debug('Found symbol `%s`.', symbol)
return ptr

try:
lookup('nvmlDeviceGetConfComputeMemSizeInfo')
except NVMLError_FunctionNotFound:
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t
LOGGER.debug(
'NVML get running process version 3 API with v3 type struct is not available '
'due to incompatible NVIDIA driver. Fallback to use get running process '
'version 3 API with v2 type struct.',
)
try:
lookup('nvmlDeviceGetComputeRunningProcesses_v3')
except NVMLError_FunctionNotFound:
__get_running_processes_version_suffix = '_v2'
if lookup('nvmlDeviceGetComputeRunningProcesses_v3'):
if lookup('nvmlDeviceGetConfComputeMemSizeInfo') and not lookup(
'nvmlDeviceGetRunningProcessDetailList',
):
LOGGER.debug(
'NVML get running process version 3 API with v2 type struct is not '
'available due to incompatible NVIDIA driver. Fallback to use get running '
'process version 2 API with v2 type struct.',
'NVML get running process version 3 API with v3 type struct is available.',
)
try:
lookup('nvmlDeviceGetComputeRunningProcesses_v2')
except NVMLError_FunctionNotFound:
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v1_t
__get_running_processes_version_suffix = ''
LOGGER.debug(
'NVML get running process version 2 API with v2 type struct is not '
'available due to incompatible NVIDIA driver. Fallback to use get '
'running process version 1 API with v1 type struct.',
)
else:
LOGGER.debug(
'NVML get running process version 2 API with v2 type struct is '
'available.',
)
else:
LOGGER.debug(
'NVML get running process version 3 API with v2 type struct is available.',
)
else:
try:
lookup('nvmlDeviceGetComputeRunningProcesses_v3')
except NVMLError_FunctionNotFound:
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t
__get_running_processes_version_suffix = '_v2'
LOGGER.debug(
'NVML get running process version 3 API with v3 type struct is not '
'available due to incompatible NVIDIA driver. Fallback to use get running '
'process version 2 API with v2 type struct.',
'process version 3 API with v2 type struct.',
)
else:
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t
__get_running_processes_version_suffix = '_v2'
LOGGER.debug(
'NVML get running process version 3 API with v3 type struct is not available '
'due to incompatible NVIDIA driver. Fallback to use get running process '
'version 2 API with v2 type struct.',
)
if lookup('nvmlDeviceGetComputeRunningProcesses_v2'):
LOGGER.debug(
'NVML get running process version 2 API with v2 type struct is available.',
)
else:
c_nvmlProcessInfo_t = c_nvmlProcessInfo_v1_t
__get_running_processes_version_suffix = ''
LOGGER.debug(
'NVML get running process version 3 API with v3 type struct is available.',
'NVML get running process version 2 API with v2 type struct is not '
'available due to incompatible NVIDIA driver. Fallback to use get '
'running process version 1 API with v1 type struct.',
)

return __get_running_processes_version_suffix
Expand Down Expand Up @@ -697,8 +679,6 @@ def __nvml_device_get_running_processes(
if obj.usedGpuMemory == ULONGLONG_MAX:
# Special case for WDDM on Windows, see comment above
obj.usedGpuMemory = None
if getattr(obj, 'usedGpuCcProtectedMemory', None) == ULONGLONG_MAX:
obj.usedGpuCcProtectedMemory = None
processes.append(obj)

return processes
Expand Down
29 changes: 0 additions & 29 deletions nvitop/api/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,6 @@ def __new__(
gpu_memory: int | NaType | None = None,
gpu_instance_id: int | NaType | None = None,
compute_instance_id: int | NaType | None = None,
gpu_cc_protected_memory: int | NaType | None = None,
type: str | NaType | None = None, # pylint: disable=redefined-builtin
# pylint: enable=unused-argument
) -> Self:
Expand Down Expand Up @@ -509,7 +508,6 @@ def __init__(
gpu_memory: int | NaType | None = None,
gpu_instance_id: int | NaType | None = None,
compute_instance_id: int | NaType | None = None,
gpu_cc_protected_memory: int | NaType | None = None,
type: str | NaType | None = None, # pylint: disable=redefined-builtin
) -> None:
"""Initialize the instance returned by :meth:`__new__()`."""
Expand All @@ -534,14 +532,6 @@ def __init__(
else:
self._gpu_instance_id = self._compute_instance_id = NA

if gpu_cc_protected_memory is None and not hasattr(
self,
'_gpu_cc_protected_memory',
):
gpu_cc_protected_memory = NA
if gpu_cc_protected_memory is not None:
self.set_gpu_cc_protected_memory(gpu_cc_protected_memory)

for util in ('sm', 'memory', 'encoder', 'decoder'):
if not hasattr(self, f'_gpu_{util}_utilization'):
setattr(self, f'_gpu_{util}_utilization', NA)
Expand Down Expand Up @@ -631,15 +621,6 @@ def gpu_memory_percent(self) -> float | NaType: # in percentage
"""The percentage of used GPU memory by the process, or :const:`nvitop.NA` if not applicable."""
return self._gpu_memory_percent

def gpu_cc_protected_memory(self) -> int | NaType: # in bytes
"""The used GPU conf compute protected memory in bytes, or :const:`nvitop.NA` if not applicable."""
return self._gpu_cc_protected_memory

def gpu_cc_protected_memory_human(self) -> str | NaType: # in human readable
# pylint: disable-next=line-too-long
"""The used GPU conf compute protected memory in human readable format, or :const:`nvitop.NA` if not applicable."""
return self._gpu_cc_protected_memory_human

def gpu_sm_utilization(self) -> int | NaType: # in percentage
"""The utilization rate of SM (Streaming Multiprocessor), or :const:`nvitop.NA` if not applicable."""
return self._gpu_sm_utilization
Expand Down Expand Up @@ -667,12 +648,6 @@ def set_gpu_memory(self, value: int | NaType) -> None:
gpu_memory_percent = round(100.0 * memory_used / memory_total, 1) # type: ignore[assignment]
self._gpu_memory_percent = gpu_memory_percent

def set_gpu_cc_protected_memory(self, value: int | NaType) -> None:
"""Set the used GPU conf compute protected memory in bytes."""
# pylint: disable=attribute-defined-outside-init
self._gpu_cc_protected_memory = value
self._gpu_cc_protected_memory_human = bytes2human(self.gpu_cc_protected_memory())

def set_gpu_utilization(
self,
gpu_sm_utilization: int | NaType | None = None,
Expand All @@ -694,15 +669,13 @@ def set_gpu_utilization(
def update_gpu_status(self) -> int | NaType:
"""Update the GPU consumption status from a new NVML query."""
self.set_gpu_memory(NA)
self.set_gpu_cc_protected_memory(NA)
self.set_gpu_utilization(NA, NA, NA, NA)
processes = self.device.processes()
process = processes.get(self.pid, self)
if process is not self:
# The current process is gone and the instance has been removed from the cache.
# Update GPU status from the new instance.
self.set_gpu_memory(process.gpu_memory())
self.set_gpu_cc_protected_memory(process.gpu_cc_protected_memory())
self.set_gpu_utilization(
process.gpu_sm_utilization(),
process.gpu_memory_utilization(),
Expand Down Expand Up @@ -1031,8 +1004,6 @@ def as_snapshot(
gpu_memory_utilization=self.gpu_memory_utilization(),
gpu_encoder_utilization=self.gpu_encoder_utilization(),
gpu_decoder_utilization=self.gpu_decoder_utilization(),
gpu_cc_protected_memory=self.gpu_cc_protected_memory(),
gpu_cc_protected_memory_human=self.gpu_cc_protected_memory_human(),
)

@classmethod
Expand Down

0 comments on commit 9ff3ec3

Please sign in to comment.