fix(api/libnvml): fix upstream changes for process info v3 APIs on 535.104.05 driver (#94)
XuehaiPan · Aug 26, 2023 · 9ff3ec3 · 9ff3ec3
1 parent 6a9663b
commit 9ff3ec3
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 76 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -25,7 +25,7 @@ repos:
       - id: debug-statements
       - id: double-quote-string-fixer
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.284
+    rev: v0.0.286
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fix upstream changes for process info v3 APIs on 535.104.05 driver by [@XuehaiPan](https://github.com/XuehaiPan) in [#94](https://github.com/XuehaiPan/nvitop/pull/94).
 - Fix removal for process info v3 APIs on the upstream 535.98 driver by [@XuehaiPan](https://github.com/XuehaiPan) in [#89](https://github.com/XuehaiPan/nvitop/pull/89).
 
 ### Removed

diff --git a/nvitop/api/device.py b/nvitop/api/device.py
@@ -2111,7 +2111,6 @@ def processes(self) -> dict[int, GpuProcess]:
                     gpu_memory=gpu_memory,
                     gpu_instance_id=getattr(p, 'gpuInstanceId', UINT_MAX),
                     compute_instance_id=getattr(p, 'computeInstanceId', UINT_MAX),
-                    gpu_cc_protected_memory=getattr(p, 'usedGpuCcProtectedMemory', NA),
                 )
                 proc.type = proc.type + type
 

diff --git a/nvitop/api/libnvml.py b/nvitop/api/libnvml.py
@@ -595,66 +595,48 @@ def __determine_get_running_processes_version_suffix() -> str:
             _nvmlGetFunctionPointer = _pynvml._nvmlGetFunctionPointer
             __get_running_processes_version_suffix = '_v3'
 
-            def lookup(symbol: str) -> _Any:
+            def lookup(symbol: str) -> _Any | None:
                 try:
                     ptr = _nvmlGetFunctionPointer(symbol)
                 except NVMLError_FunctionNotFound:
                     LOGGER.debug('Failed to found symbol `%s`.', symbol)
-                    raise
+                    return None
                 LOGGER.debug('Found symbol `%s`.', symbol)
                 return ptr
 
-            try:
-                lookup('nvmlDeviceGetConfComputeMemSizeInfo')
-            except NVMLError_FunctionNotFound:
-                c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t
-                LOGGER.debug(
-                    'NVML get running process version 3 API with v3 type struct is not available '
-                    'due to incompatible NVIDIA driver. Fallback to use get running process '
-                    'version 3 API with v2 type struct.',
-                )
-                try:
-                    lookup('nvmlDeviceGetComputeRunningProcesses_v3')
-                except NVMLError_FunctionNotFound:
-                    __get_running_processes_version_suffix = '_v2'
+            if lookup('nvmlDeviceGetComputeRunningProcesses_v3'):
+                if lookup('nvmlDeviceGetConfComputeMemSizeInfo') and not lookup(
+                    'nvmlDeviceGetRunningProcessDetailList',
+                ):
                     LOGGER.debug(
-                        'NVML get running process version 3 API with v2 type struct is not '
-                        'available due to incompatible NVIDIA driver. Fallback to use get running '
-                        'process version 2 API with v2 type struct.',
+                        'NVML get running process version 3 API with v3 type struct is available.',
                     )
-                    try:
-                        lookup('nvmlDeviceGetComputeRunningProcesses_v2')
-                    except NVMLError_FunctionNotFound:
-                        c_nvmlProcessInfo_t = c_nvmlProcessInfo_v1_t
-                        __get_running_processes_version_suffix = ''
-                        LOGGER.debug(
-                            'NVML get running process version 2 API with v2 type struct is not '
-                            'available due to incompatible NVIDIA driver. Fallback to use get '
-                            'running process version 1 API with v1 type struct.',
-                        )
-                    else:
-                        LOGGER.debug(
-                            'NVML get running process version 2 API with v2 type struct is '
-                            'available.',
-                        )
                 else:
-                    LOGGER.debug(
-                        'NVML get running process version 3 API with v2 type struct is available.',
-                    )
-            else:
-                try:
-                    lookup('nvmlDeviceGetComputeRunningProcesses_v3')
-                except NVMLError_FunctionNotFound:
                     c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t
-                    __get_running_processes_version_suffix = '_v2'
                     LOGGER.debug(
                         'NVML get running process version 3 API with v3 type struct is not '
                         'available due to incompatible NVIDIA driver. Fallback to use get running '
-                        'process version 2 API with v2 type struct.',
+                        'process version 3 API with v2 type struct.',
+                    )
+            else:
+                c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t
+                __get_running_processes_version_suffix = '_v2'
+                LOGGER.debug(
+                    'NVML get running process version 3 API with v3 type struct is not available '
+                    'due to incompatible NVIDIA driver. Fallback to use get running process '
+                    'version 2 API with v2 type struct.',
+                )
+                if lookup('nvmlDeviceGetComputeRunningProcesses_v2'):
+                    LOGGER.debug(
+                        'NVML get running process version 2 API with v2 type struct is available.',
                     )
                 else:
+                    c_nvmlProcessInfo_t = c_nvmlProcessInfo_v1_t
+                    __get_running_processes_version_suffix = ''
                     LOGGER.debug(
-                        'NVML get running process version 3 API with v3 type struct is available.',
+                        'NVML get running process version 2 API with v2 type struct is not '
+                        'available due to incompatible NVIDIA driver. Fallback to use get '
+                        'running process version 1 API with v1 type struct.',
                     )
 
         return __get_running_processes_version_suffix
@@ -697,8 +679,6 @@ def __nvml_device_get_running_processes(
                 if obj.usedGpuMemory == ULONGLONG_MAX:
                     # Special case for WDDM on Windows, see comment above
                     obj.usedGpuMemory = None
-                if getattr(obj, 'usedGpuCcProtectedMemory', None) == ULONGLONG_MAX:
-                    obj.usedGpuCcProtectedMemory = None
                 processes.append(obj)
 
             return processes

diff --git a/nvitop/api/process.py b/nvitop/api/process.py
@@ -470,7 +470,6 @@ def __new__(
         gpu_memory: int | NaType | None = None,
         gpu_instance_id: int | NaType | None = None,
         compute_instance_id: int | NaType | None = None,
-        gpu_cc_protected_memory: int | NaType | None = None,
         type: str | NaType | None = None,  # pylint: disable=redefined-builtin
         # pylint: enable=unused-argument
     ) -> Self:
@@ -509,7 +508,6 @@ def __init__(
         gpu_memory: int | NaType | None = None,
         gpu_instance_id: int | NaType | None = None,
         compute_instance_id: int | NaType | None = None,
-        gpu_cc_protected_memory: int | NaType | None = None,
         type: str | NaType | None = None,  # pylint: disable=redefined-builtin
     ) -> None:
         """Initialize the instance returned by :meth:`__new__()`."""
@@ -534,14 +532,6 @@ def __init__(
         else:
             self._gpu_instance_id = self._compute_instance_id = NA
 
-        if gpu_cc_protected_memory is None and not hasattr(
-            self,
-            '_gpu_cc_protected_memory',
-        ):
-            gpu_cc_protected_memory = NA
-        if gpu_cc_protected_memory is not None:
-            self.set_gpu_cc_protected_memory(gpu_cc_protected_memory)
-
         for util in ('sm', 'memory', 'encoder', 'decoder'):
             if not hasattr(self, f'_gpu_{util}_utilization'):
                 setattr(self, f'_gpu_{util}_utilization', NA)
@@ -631,15 +621,6 @@ def gpu_memory_percent(self) -> float | NaType:  # in percentage
         """The percentage of used GPU memory by the process, or :const:`nvitop.NA` if not applicable."""
         return self._gpu_memory_percent
 
-    def gpu_cc_protected_memory(self) -> int | NaType:  # in bytes
-        """The used GPU conf compute protected memory in bytes, or :const:`nvitop.NA` if not applicable."""
-        return self._gpu_cc_protected_memory
-
-    def gpu_cc_protected_memory_human(self) -> str | NaType:  # in human readable
-        # pylint: disable-next=line-too-long
-        """The used GPU conf compute protected memory in human readable format, or :const:`nvitop.NA` if not applicable."""
-        return self._gpu_cc_protected_memory_human
-
     def gpu_sm_utilization(self) -> int | NaType:  # in percentage
         """The utilization rate of SM (Streaming Multiprocessor), or :const:`nvitop.NA` if not applicable."""
         return self._gpu_sm_utilization
@@ -667,12 +648,6 @@ def set_gpu_memory(self, value: int | NaType) -> None:
             gpu_memory_percent = round(100.0 * memory_used / memory_total, 1)  # type: ignore[assignment]
         self._gpu_memory_percent = gpu_memory_percent
 
-    def set_gpu_cc_protected_memory(self, value: int | NaType) -> None:
-        """Set the used GPU conf compute protected memory in bytes."""
-        # pylint: disable=attribute-defined-outside-init
-        self._gpu_cc_protected_memory = value
-        self._gpu_cc_protected_memory_human = bytes2human(self.gpu_cc_protected_memory())
-
     def set_gpu_utilization(
         self,
         gpu_sm_utilization: int | NaType | None = None,
@@ -694,15 +669,13 @@ def set_gpu_utilization(
     def update_gpu_status(self) -> int | NaType:
         """Update the GPU consumption status from a new NVML query."""
         self.set_gpu_memory(NA)
-        self.set_gpu_cc_protected_memory(NA)
         self.set_gpu_utilization(NA, NA, NA, NA)
         processes = self.device.processes()
         process = processes.get(self.pid, self)
         if process is not self:
             # The current process is gone and the instance has been removed from the cache.
             # Update GPU status from the new instance.
             self.set_gpu_memory(process.gpu_memory())
-            self.set_gpu_cc_protected_memory(process.gpu_cc_protected_memory())
             self.set_gpu_utilization(
                 process.gpu_sm_utilization(),
                 process.gpu_memory_utilization(),
@@ -1031,8 +1004,6 @@ def as_snapshot(
             gpu_memory_utilization=self.gpu_memory_utilization(),
             gpu_encoder_utilization=self.gpu_encoder_utilization(),
             gpu_decoder_utilization=self.gpu_decoder_utilization(),
-            gpu_cc_protected_memory=self.gpu_cc_protected_memory(),
-            gpu_cc_protected_memory_human=self.gpu_cc_protected_memory_human(),
         )
 
     @classmethod