NVIDIA · fifofonix · Mar 27, 2024 · Jun 11, 2024 · Feb 14, 2024 · Apr 30, 2024
diff --git a/ci/fedora/.gitlab-ci-fcos.yml b/ci/fedora/.gitlab-ci-fcos.yml
@@ -35,9 +35,9 @@
 # the commit sha).  This release stage is purely to test out the CICD code that
 # would for the 'fedora' branch publish to a remote repository.
 #
-# Branches == "fedora" and tags == .*fedora$
+# Tags == .*fedora$
 #
-# The protected branch 'fedora' will cause container image builds on all three
+# Matching pipelines will cause container image builds on all three
 # fcos runner types and build ALL_DRIVER_VERSIONS.  The images will then be scan-
 # ned and providing there are no detected vulnerabilities will be pushed to the
 # remote repository defined by RELEASE_REGISTRY_PROJECT.
@@ -49,7 +49,7 @@
 #
 # Branches == "fedora.+"
 #
-# Any other protected branch with the word fedora in it will do the same - build
+# Any protected branch whose name starts with fedora will do the same - build
 # all the NVIDIA driver versions on all the fcos releases - and scan them, but
 # will not publish them to the remote registry.
 #
@@ -96,10 +96,10 @@ variables:
   # To survey latest Data Center driver versions available:
   # https://www.nvidia.com/Download/Find.aspx
   # https://www.nvidia.com/en-us/drivers/unix/
-  DRIVER_VERSION: "535.154.05"
-  DRIVER_VERSIONS: 535.154.05 525.147.05
+  DRIVER_VERSION: "550.90.07"
+  DRIVER_VERSIONS: 550.90.07 535.183.01
 
-  CUDA_VERSION: 12.2.0
+  CUDA_VERSION: 12.4.1
 
   CVE_UPDATES: "curl libc6"
 
@@ -115,9 +115,9 @@ variables:
   RELEASE_REGISTRY_TOKEN: ""
 
 default:
-  image: docker:20.10.10-git
+  image: docker:25.0.2-git
   services:
-    - name: docker:20.10.10-dind
+    - name: docker:25.0.2-dind
 
 stages:
   - build
@@ -199,8 +199,9 @@ build-push-next-one-only:
     - for driver_version in ${DRIVER_VERSION}; do build_push_fn ${driver_version} $OVERWRITE_TAGS ${CI_COMMIT_SHORT_SHA}-; done
   tags:
     - fcos-next
-  except:
-    - /fedora/
+  rules:
+    # Only run on branches (not tags) which do not start with fedora
+    - if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null
 
 build-push:
   stage: build
@@ -212,8 +213,8 @@ build-push:
       - STREAM: [next, testing, stable]
   tags:
     - fcos-${STREAM}
-  only:
-    - /fedora/
+  rules:
+    - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/
 
 .common-scan:
   image: registry.gitlab.com/security-products/container-scanning:6
@@ -273,8 +274,9 @@ scan-next-one-only:
     - scan_fn ${DRIVER_VERSION} ${CI_COMMIT_SHORT_SHA}-
   tags:
     - fcos-next
-  except:
-    - /fedora/
+  rules:
+    # Only run on branches (not tags) which do not start with fedora
+    - if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null
 
 # Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies.
 # https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2
@@ -288,24 +290,24 @@ scan-next:
   needs: ["build-push: [next]"]
   tags:
     - fcos-next
-  only:
-    - /fedora/
+  rules:
+    - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/
 
 scan-testing:
   extends: .common-scan
   needs: ["build-push: [testing]"]
   tags:
     - fcos-testing
-  only:
-    - /fedora/
+  rules:
+    - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/
 
 scan-stable:
   extends: .common-scan
   needs: ["build-push: [stable]"]
   tags:
     - fcos-stable
-  only:
-    - /fedora/
+  rules:
+    - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/
 
 .common-release-fn-script: &common-release-fn-script
   - |
@@ -347,7 +349,7 @@ scan-stable:
     - docker login -u "${RELEASE_REGISTRY_USER}" -p "${RELEASE_REGISTRY_TOKEN}" "${RELEASE_REGISTRY}"
     - for driver_version in ${DRIVER_VERSIONS:-${DRIVER_VERSION}}; do release_fn ${driver_version};done
   rules:
-    - if: $CI_COMMIT_TAG =~ /fedora$/ || $CI_COMMIT_REF_NAME == 'fedora'
+    - if: $CI_COMMIT_TAG =~ /fedora$/
 
 release-next-one-only:
   stage: release
@@ -370,8 +372,9 @@ release-next-one-only:
     - for driver_version in ${DRIVER_VERSION}; do release_fn ${driver_version} ${OVERWRITE_REMOTE_TAGS} ${CI_COMMIT_SHORT_SHA}-; done
   tags:
     - fcos-next
-  except:
-    - /fedora/
+  rules:
+    # Only run on branches (not tags) which do not start with fedora
+    - if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null
 
 # Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies.
 # https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2

diff --git a/fedora/Dockerfile b/fedora/Dockerfile
@@ -9,7 +9,7 @@ SHELL ["/bin/bash", "-c"]
 
 RUN dnf install -y git wget
 
-ENV GOLANG_VERSION=1.21.5
+ENV GOLANG_VERSION=1.22.2
 
 # download appropriate binary based on the target architecture for multi-arch builds
 RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \
@@ -63,6 +63,8 @@ ENV NVIDIA_VISIBLE_DEVICES=void
 # getopt etc.
 RUN dnf install -y util-linux 'dnf-command(download)'
 
+RUN dnf install -y patch
+
 ADD install.sh /tmp/
 
 RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \

diff --git a/fedora/README.md b/fedora/README.md
@@ -27,7 +27,7 @@ Currently built driver versions are specified in `ci/fedora/.common-ci-fcos.yml`
 The driver container is privileged, and here we choose to launch via podman instead of docker although both work.
 
 ```bash
-$ DRIVER_VERSION=535.104.12 # Check ci/fedora/.common-ci-fcos.yml for latest
+$ DRIVER_VERSION=550.90.07 # Check ci/fedora/.common-ci-fcos.yml for latest driver versions
 $ FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2)
 $ podman run -d --privileged --pid=host \
      -v /run/nvidia:/run/nvidia:shared \
@@ -36,13 +36,14 @@ $ podman run -d --privileged --pid=host \
      registry.gitlab.com/container-toolkit-fcos/driver:${DRIVER_VERSION}-fedora$$FEDORA_VERSION_ID
 ```
 
-Or, on FCOS registering as a systemd unit via an ignition snippet, and using an image with kernel headers pre-installed for faster start up:
+Or, on FCOS registering as a systemd unit via an ignition snippet. In this unit we attempt to pull a driver image matching the running kernel version (with pre-compiled kernel headers), but fall back to a generic Fedora version if one does not exist. Furthermore, we
+mount a single patch file from a host directory that, if detected, will be applied to the generic Fedora version.
 
 ```yaml
 variant: fcos
-version: 1.4.0
-storage:
-  files:
+version: 1.5.0
+systemd:
+  units:
     - name: acme-nvidia-driver.service
       enabled: true
       contents: |
@@ -57,18 +58,32 @@ storage:
         ExecStartPre=-/bin/podman rm nvidia-driver
         ExecStartPre=-setenforce 0
         ExecStartPre=-/bin/mkdir -p /run/nvidia
-        ExecStartPre=-/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \
-            /bin/podman pull registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID'
+        # 5/17/24 - Without the following line the nvidia driver container will crash with no meaningful error message
         ExecStartPre=-/usr/sbin/modprobe video
-        ExecStart=/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \
-            /bin/podman run --name nvidia-driver \
-                -v /run/nvidia:/run/nvidia:shared \
-                -v /var/log:/var/log \
-                --privileged --pid=host \
-                # No need for network IF using container image with pre-built kernel headers \
-                --network=none \
-                registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID \
-                            --accept-license'
+
+        # If there is a kernel-specific image (with pre-compiled kernel headers) then
+        # use it, otherwise fall back to the generic Fedora image mounting any patches that exist.
+        # Every line of the command below must end in a backslash to stay one ExecStart value.
+        # Replace registry.gitlab.com/container-toolkit-fcos/driver with the registry name
+        # of your built/published driver images, or perhaps, docker.io/fifofonix/driver
+        ExecStart=/bin/sh -c ' \
+          FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \
+          KERNEL_VERSION=$(/bin/uname -r); \
+          if /bin/podman manifest inspect registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID > /dev/null; then \
+            IMAGE_NAME=registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID; \
+          else \
+            IMAGE_NAME=registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-fedora$$FEDORA_VERSION_ID; \
+            PATCH_MOUNT="-v /var/acme/nvidia-driver-patch:/patch"; \
+          fi; \
+          /bin/podman pull $$IMAGE_NAME; \
+          /bin/podman run --name nvidia-driver \
+            -v /run/nvidia:/run/nvidia:shared \
+            -v /var/log:/var/log \
+            $$PATCH_MOUNT \
+            --privileged \
+            --pid host \
+            $$IMAGE_NAME \
+                --accept-license'
 
         ExecStop=/bin/podman stop nvidia-driver
         Restart=on-failure
@@ -84,47 +99,64 @@ You should be able to step into the driver container and run the `nvidia-smi` to
 
 ```bash
 $ # Assumes you're running the driver container via podman and named nvidia-driver as above...
-$ podman exec -it nvidia-driver bash
-[root@8dc88dad905e nvidia-510.47.03]# nvidia-smi
-Wed May 25 15:24:00 2022
-+-----------------------------------------------------------------------------+
-| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
-|-------------------------------+----------------------+----------------------+
-| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
-|                               |                      |               MIG M. |
-|===============================+======================+======================|
-|   0  NVIDIA A10G         On   | 00000000:00:1E.0 Off |                    0 |
-|  0%   39C    P0   197W / 300W |  22022MiB / 23028MiB |     96%      Default |
-|                               |                      |                  N/A |
-+-------------------------------+----------------------+----------------------+
-
-+-----------------------------------------------------------------------------+
-| Processes:                                                                  |
-|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
-|        ID   ID                                                   Usage      |
-|=============================================================================|
-|  No running processes found                                                 |
-+-----------------------------------------------------------------------------+
-[root@8dc88dad905e]#
+$ podman exec -it nvidia-driver sh
+sh-5.2# nvidia-smi
+Tue Jun 11 19:55:25 2024
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
+|-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  Tesla M60                      On  |   00000000:00:1E.0 Off |                    0 |
+| N/A   47C    P0             46W /  150W |    7131MiB /   7680MiB |      0%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
 ```
 
 ### Install Container Runtime / Toolkit
 
 To run a CUDA container that leverages the NVIDIA driver container you now have running, install the separate NVIDIA container runtime and register it with your container runtime system (e.g. docker) following NVIDIA's instructions [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
 
-On FedoraCoreOS you may choose to layer the container toolkit using `rpm-ostree`, and configure your runtime, with an ignition snippet like this (substitute your runtime, docker is shown, but containerd works too for example):
+On FedoraCoreOS you may choose to layer the container toolkit using `rpm-ostree`, and configure your runtime, with an ignition snippet like this (substitute your runtime, containerd is shown, but docker works too for example):
 
 ```yaml
 variant: fcos
-version: 1.4.0
+version: 1.5.0
 storage:
   files:
-    - name: acme-layer-nvidia-container-runtime.service
+    - path: /etc/nvidia-container-runtime/config.toml
+      mode: 0644
+      contents:
+        inline: |
+          [nvidia-container-cli]
+          #debug = "/var/log/nvidia-container-toolkit.log"
+          root = "/run/nvidia/driver"
+          path = "/usr/bin/nvidia-container-cli"
+    # Improvements made in NVIDIA container toolkit 1.15.0 do not yet seem to correctly
+    # support FCOS so we still need to explicitly add the driver path to ld.so.conf
+    - path: /etc/ld.so.conf.d/container-toolkit.conf
+      mode: 0644
+      contents:
+        inline: |
+          /run/nvidia/driver/usr/lib64
+systemd:
+  units:
+    - name: acme-layer-nvidia-container-toolkit.service
       enabled: true
       # We run before `zincati.service` to avoid conflicting rpm-ostree transactions.
       contents: |
         [Unit]
+        Wants=network-online.target
         After=network-online.target
         Before=zincati.service
         ConditionPathExists=!/var/lib/%N.stamp
@@ -137,13 +169,12 @@ storage:
         ExecStartPre=-/bin/sh -c 'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
             > /etc/yum.repos.d/nvidia-container-toolkit.repo'
         # Perhaps consider pinning the rpm version here depending on change aversion...
-        ExecStart=/usr/bin/rpm-ostree install --idempotent --allow-inactive --apply-live nvidia-container-toolkit
-        ExecStart=/bin/sh -c 'echo "/run/nvidia/driver/usr/lib64" > /etc/ld.so.conf.d/nv.conf; ldconfig'
-        # If we see that the nvidia-ctk is present, then we can configure docker...
+        ExecStart=/usr/bin/rpm-ostree install -y --idempotent --allow-inactive nvidia-container-toolkit
         ExecStart=/bin/sh -c 'if [[ -f /usr/bin/nvidia-ctk ]]; then \
-              /usr/bin/nvidia-ctk runtime configure --runtime=docker --nvidia-set-as-default; \
-              systemctl restart docker; \
+              /usr/bin/nvidia-ctk runtime configure --runtime=containerd --nvidia-set-as-default; \
+              systemctl restart containerd; \
               /bin/touch /var/lib/%N.stamp; fi'
+        ExecStart=/bin/systemctl --no-block reboot
         Restart=on-failure
         RestartSec=60
 

diff --git a/fedora/nvidia-driver b/fedora/nvidia-driver
@@ -129,7 +129,7 @@ _install_prerequisites() (
     rm ./*.rpm
 
     echo "Installing Linux kernel-modules-core files..."
-    if ! dnf -q -y download kernel-modules-core${KERNEL_VERSION} > /dev/null; then
+    if ! dnf -q -y download kernel-modules-core-${KERNEL_VERSION} > /dev/null; then
         echo "Failed to find kernel-modules-core-${KERNEL_VERSION} in repositories."
         echo "Trying to download kernel-modules-core from koji..."
         KOJI_KERNEL_CORE_RPM=$KOJI_BASE_URL/packages/kernel/$KERNEL_RPM_VERSION/$KERNEL_RPM_RELEASE/$KERNEL_RPM_ARCH/kernel-modules-core-$KERNEL_VERSION.rpm
@@ -240,9 +240,9 @@ _create_driver_package() (
         # lrwxrwxrwx 1 root root   36 Dec  8 20:10 default -> /etc/alternatives/ofa_kernel_headers
         # drwxr-xr-x 4 root root 4096 Dec  8 20:14 x86_64
         # lrwxrwxrwx 1 root root   44 Dec  9 19:05 5.4.0-90-generic -> /usr/src/ofa_kernel/x86_64/5.4.0-90-generic/
-        if [[ -d /run/mellanox/drivers/usr/src/ofa_kernel/x86_64/`uname -r` ]]; then
-            if [[ ! -e /usr/src/ofa_kernel/`uname -r` ]]; then
-                ln -s /run/mellanox/drivers/usr/src/ofa_kernel/x86_64/`uname -r` /usr/src/ofa_kernel/
+        if [[ -d "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" ]]; then
+            if [[ ! -e "/usr/src/ofa_kernel/$(uname -r)" ]]; then
+                ln -s "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" /usr/src/ofa_kernel/
             fi
         fi
     fi
@@ -595,14 +595,33 @@ _start_vgpu_topology_daemon() {
     nvidia-topologyd
 }
 
+_apply_patch () {
+    # Apply the single *.patch file mounted at /patch, if any (unmatched globs stay literal, hence -f)
+    local patch patches=()
+    for patch in /patch/*.patch; do
+        if [ -f "${patch}" ]; then patches+=("${patch}"); fi
+    done
+    if [ ${#patches[@]} -gt 1 ]; then
+        echo "Multiple patches found, only one patch is supported"
+        exit 1
+    elif [ ${#patches[@]} -eq 1 ]; then
+        sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run --apply-patch "${patches[0]}" -m=${KERNEL_TYPE} || exit 1
+        # The patched installer has by default the file name ending '-custom.run'
+        PATCHED_SUFFIX="-custom"
+        echo -e "NVIDIA Software installer patched with '${patches[0]}'\n"
+    fi
+}
+
 _prepare() {
     if [ "${DRIVER_TYPE}" = "vgpu" ]; then
         _find_vgpu_driver_version || exit 1
     fi
 
+    _apply_patch
+
     # Install the userspace components and copy the kernel module sources.
-    sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x -m=${KERNEL_TYPE} && \
-        cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
+    sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION${PATCHED_SUFFIX:-}.run -x -m=${KERNEL_TYPE} && \
+        cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION${PATCHED_SUFFIX:-} && \
         sh /tmp/install.sh nvinstall && \
         mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \
         mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \