From 960ec29f3230dbe7602aa9ef60c6960e7db4f16d Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Tue, 5 Nov 2024 19:14:26 +0000 Subject: [PATCH 1/3] Fix NVIDIA driver issue on focal, jammy and rl9 --- .../applications/nvml/centos_rhel/install | 6 +++--- .../applications/nvml/debian_ubuntu/install | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install b/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install index 017c8738a0..8f700d4116 100644 --- a/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install @@ -18,7 +18,7 @@ install_cuda_from_runfile() { # Remove existing installation before using the runfile remove_cuda_package remove_driver_package - sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils + sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils git # Installing latest version of NVIDIA CUDA and driver # Data Center/Tesla drivers and CUDA are released on different schedules; @@ -42,7 +42,7 @@ install_cuda_from_runfile() { setup_repo() { # Enable EPEL (Extra Packages for Enterprise Linux) for packages such as DKMS # Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9 - sudo yum install -y yum-utils epel-release + sudo yum install -y yum-utils epel-release git sudo yum-config-manager \ --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo sudo yum clean all @@ -51,7 +51,7 @@ setup_repo() { install_cuda_from_package_manager() { setup_repo install_driver_package - sudo yum -y install cuda + sudo yum -y install cuda-toolkit cuda-demo* verify_driver } diff --git a/integration_test/third_party_apps_test/applications/nvml/debian_ubuntu/install b/integration_test/third_party_apps_test/applications/nvml/debian_ubuntu/install index 58af03dcea..5c3cdf85f6 100644 --- a/integration_test/third_party_apps_test/applications/nvml/debian_ubuntu/install +++ b/integration_test/third_party_apps_test/applications/nvml/debian_ubuntu/install @@ -12,8 +12,8 @@ sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common p # manager's package is not working or not compitible with the GPU model DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g') echo "Installing latest version of NVIDIA CUDA and driver" -wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb -sudo dpkg -i cuda-keyring_1.0-1_all.deb +wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb if [[ $ID == debian ]]; then sudo add-apt-repository contrib fi @@ -34,7 +34,7 @@ case $DEVICE_CODE in ;; *) # For newer GPUs, install the latest version - sudo apt -y install cuda + sudo apt -y install cuda-12-6 ;; esac From c80a545bd1a3ac3f7eebde9db1a2ab4d5ea5749a Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Tue, 5 Nov 2024 20:35:09 +0000 Subject: [PATCH 2/3] Fix DCGM tests --- .../applications/dcgm/centos_rhel/install | 2 +- .../applications/dcgm/debian_ubuntu/install | 6 +++--- .../applications/dcgmv1/centos_rhel/install | 2 +- .../applications/dcgmv1/debian_ubuntu/install | 6 +++--- .../applications/nvml/centos_rhel/install | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install b/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install index 98698bf9f9..b6c65a18e2 100644 --- a/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install @@ -51,7 +51,7 @@ setup_repo() { install_cuda_from_package_manager() { setup_repo install_driver_package - sudo yum -y install cuda + sudo yum -y install cuda-toolkit cuda-demo* verify_driver } diff --git a/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install b/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install index e926f10514..170ab4b480 100644 --- a/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install +++ b/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install @@ -11,8 +11,8 @@ sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common p # manager's package is not working or not compitible with the GPU model DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g') # Need to add the keyring for installing CUDA and DCGM -wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb -sudo dpkg -i cuda-keyring_1.0-1_all.deb +wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb echo "Installing latest version of NVIDIA CUDA and driver" if [[ $ID == debian ]]; then sudo add-apt-repository contrib @@ -34,7 +34,7 @@ case $DEVICE_CODE in ;; *) # For newer GPUs, install the latest version - sudo apt -y install cuda + sudo apt -y install cuda-12-6 ;; esac diff --git a/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install b/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install index 98698bf9f9..d1c1263f25 100644 --- a/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install @@ -51,7 +51,7 @@ setup_repo() { install_cuda_from_package_manager() { setup_repo install_driver_package - sudo yum -y install cuda + sudo yum -y install cuda-toolkit cuda-demo* verify_driver } diff --git a/integration_test/third_party_apps_test/applications/dcgmv1/debian_ubuntu/install b/integration_test/third_party_apps_test/applications/dcgmv1/debian_ubuntu/install index e926f10514..170ab4b480 100644 --- a/integration_test/third_party_apps_test/applications/dcgmv1/debian_ubuntu/install +++ b/integration_test/third_party_apps_test/applications/dcgmv1/debian_ubuntu/install @@ -11,8 +11,8 @@ sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common p # manager's package is not working or not compitible with the GPU model DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g') # Need to add the keyring for installing CUDA and DCGM -wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.0-1_all.deb -sudo dpkg -i cuda-keyring_1.0-1_all.deb +wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb echo "Installing latest version of NVIDIA CUDA and driver" if [[ $ID == debian ]]; then sudo add-apt-repository contrib @@ -34,7 +34,7 @@ case $DEVICE_CODE in ;; *) # For newer GPUs, install the latest version - sudo apt -y install cuda + sudo apt -y install cuda-12-6 ;; esac diff --git a/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install b/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install index 8f700d4116..91352ff9b3 100644 --- a/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install @@ -18,7 +18,7 @@ install_cuda_from_runfile() { # Remove existing installation before using the runfile remove_cuda_package remove_driver_package - sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils git + sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils # Installing latest version of NVIDIA CUDA and driver # Data Center/Tesla drivers and CUDA are released on different schedules; @@ -42,7 +42,7 @@ install_cuda_from_runfile() { setup_repo() { # Enable EPEL (Extra Packages for Enterprise Linux) for packages such as DKMS # Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9 - sudo yum install -y yum-utils epel-release git + sudo yum install -y yum-utils epel-release sudo yum-config-manager \ --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$VERSION_ID/x86_64/cuda-rhel$VERSION_ID.repo sudo yum clean all From 321c9273706cb8d9cfcd1a201d0c0b8643620952 Mon Sep 17 00:00:00 2001 From: Lujie Duan Date: Tue, 5 Nov 2024 22:06:13 +0000 Subject: [PATCH 3/3] Added comments with todo to remove fix --- .../applications/dcgm/centos_rhel/install | 1 + .../applications/dcgm/debian_ubuntu/install | 1 + .../applications/dcgmv1/centos_rhel/install | 1 + .../applications/dcgmv1/debian_ubuntu/install | 1 + .../applications/nvml/centos_rhel/install | 3 ++- .../applications/nvml/debian_ubuntu/install | 1 + 6 files changed, 7 insertions(+), 1 deletion(-) diff --git a/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install b/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install index b6c65a18e2..9831b116f8 100644 --- a/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install @@ -51,6 +51,7 @@ setup_repo() { install_cuda_from_package_manager() { setup_repo install_driver_package + # TODO(b/377558109): remove the temporary fix once the repo is updated sudo yum -y install cuda-toolkit cuda-demo* verify_driver } diff --git a/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install b/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install index 170ab4b480..563ef22a13 100644 --- a/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install +++ b/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install @@ -34,6 +34,7 @@ case $DEVICE_CODE in ;; *) # For newer GPUs, install the latest version + # TODO(b/377558109): remove the temporary fix once the repo is updated sudo apt -y install cuda-12-6 ;; esac diff --git a/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install b/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install index d1c1263f25..674ef516da 100644 --- a/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install @@ -51,6 +51,7 @@ setup_repo() { install_cuda_from_package_manager() { setup_repo install_driver_package + # TODO(b/377558109): remove the temporary fix once the repo is updated sudo yum -y install cuda-toolkit cuda-demo* verify_driver } diff --git a/integration_test/third_party_apps_test/applications/dcgmv1/debian_ubuntu/install b/integration_test/third_party_apps_test/applications/dcgmv1/debian_ubuntu/install index 170ab4b480..563ef22a13 100644 --- a/integration_test/third_party_apps_test/applications/dcgmv1/debian_ubuntu/install +++ b/integration_test/third_party_apps_test/applications/dcgmv1/debian_ubuntu/install @@ -34,6 +34,7 @@ case $DEVICE_CODE in ;; *) # For newer GPUs, install the latest version + # TODO(b/377558109): remove the temporary fix once the repo is updated sudo apt -y install cuda-12-6 ;; esac diff --git a/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install b/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install index 91352ff9b3..2736d89835 100644 --- a/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install @@ -18,7 +18,7 @@ install_cuda_from_runfile() { # Remove existing installation before using the runfile remove_cuda_package remove_driver_package - sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils + sudo yum install -y kernel-devel-$(uname -r) pciutils gcc make wget yum-utils # Installing latest version of NVIDIA CUDA and driver # Data Center/Tesla drivers and CUDA are released on different schedules; @@ -51,6 +51,7 @@ setup_repo() { install_cuda_from_package_manager() { setup_repo install_driver_package + # TODO(b/377558109): remove the temporary fix once the repo is updated sudo yum -y install cuda-toolkit cuda-demo* verify_driver } diff --git a/integration_test/third_party_apps_test/applications/nvml/debian_ubuntu/install b/integration_test/third_party_apps_test/applications/nvml/debian_ubuntu/install index 5c3cdf85f6..a4dfd08bd5 100644 --- a/integration_test/third_party_apps_test/applications/nvml/debian_ubuntu/install +++ b/integration_test/third_party_apps_test/applications/nvml/debian_ubuntu/install @@ -34,6 +34,7 @@ case $DEVICE_CODE in ;; *) # For newer GPUs, install the latest version + # TODO(b/377558109): remove the temporary fix once the repo is updated sudo apt -y install cuda-12-6 ;; esac