Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fedora: Add Support For Applying A Single Patch #36

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 26 additions & 23 deletions ci/fedora/.gitlab-ci-fcos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
# the commit sha). This release stage is purely to test out the CICD code that
# would for the 'fedora' branch publish to a remote repository.
#
# Branches == "fedora" and tags == .*fedora$
# Tags == .*fedora$
#
# The protected branch 'fedora' will cause container image builds on all three
# Matching pipelines will cause container image builds on all three
# fcos runner types and build ALL_DRIVER_VERSIONS. The images will then be scan-
# ned and providing there are no detected vulnerabilities will be pushed to the
# remote repository defined by RELEASE_REGISTRY_PROJECT.
Expand All @@ -49,7 +49,7 @@
#
# Branches == "fedora.+"
#
# Any other protected branch with the word fedora in it will do the same - build
# Any protected branch with the word fedora in it will do the same - build
# all the NVIDIA driver versions on all the fcos releases - and scan them, but
# will not publish them to the remote registry.
#
Expand Down Expand Up @@ -96,10 +96,10 @@ variables:
# To survey latest Data Center driver versions available:
# https://www.nvidia.com/Download/Find.aspx
# https://www.nvidia.com/en-us/drivers/unix/
DRIVER_VERSION: "535.154.05"
DRIVER_VERSIONS: 535.154.05 525.147.05
DRIVER_VERSION: "550.90.07"
DRIVER_VERSIONS: 550.90.07 535.183.01

CUDA_VERSION: 12.2.0
CUDA_VERSION: 12.4.1

CVE_UPDATES: "curl libc6"

Expand All @@ -115,9 +115,9 @@ variables:
RELEASE_REGISTRY_TOKEN: ""

default:
image: docker:20.10.10-git
image: docker:25.0.2-git
services:
- name: docker:20.10.10-dind
- name: docker:25.0.2-dind

stages:
- build
Expand Down Expand Up @@ -199,8 +199,9 @@ build-push-next-one-only:
- for driver_version in ${DRIVER_VERSION}; do build_push_fn ${driver_version} $OVERWRITE_TAGS ${CI_COMMIT_SHORT_SHA}-; done
tags:
- fcos-next
except:
- /fedora/
rules:
# Only run on branches (not tags) which do not start with fedora
- if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null

build-push:
stage: build
Expand All @@ -212,8 +213,8 @@ build-push:
- STREAM: [next, testing, stable]
tags:
- fcos-${STREAM}
only:
- /fedora/
rules:
- if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/

.common-scan:
image: registry.gitlab.com/security-products/container-scanning:6
Expand Down Expand Up @@ -273,8 +274,9 @@ scan-next-one-only:
- scan_fn ${DRIVER_VERSION} ${CI_COMMIT_SHORT_SHA}-
tags:
- fcos-next
except:
- /fedora/
rules:
# Only run on branches (not tags) which do not start with fedora
- if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null

# Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies.
# https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2
Expand All @@ -288,24 +290,24 @@ scan-next:
needs: ["build-push: [next]"]
tags:
- fcos-next
only:
- /fedora/
rules:
- if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/

scan-testing:
extends: .common-scan
needs: ["build-push: [testing]"]
tags:
- fcos-testing
only:
- /fedora/
rules:
- if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/

scan-stable:
extends: .common-scan
needs: ["build-push: [stable]"]
tags:
- fcos-stable
only:
- /fedora/
rules:
- if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/

.common-release-fn-script: &common-release-fn-script
- |
Expand Down Expand Up @@ -347,7 +349,7 @@ scan-stable:
- docker login -u "${RELEASE_REGISTRY_USER}" -p "${RELEASE_REGISTRY_TOKEN}" "${RELEASE_REGISTRY}"
- for driver_version in ${DRIVER_VERSIONS:-${DRIVER_VERSION}}; do release_fn ${driver_version};done
rules:
- if: $CI_COMMIT_TAG =~ /fedora$/ || $CI_COMMIT_REF_NAME == 'fedora'
- if: $CI_COMMIT_TAG =~ /fedora$/

release-next-one-only:
stage: release
Expand All @@ -370,8 +372,9 @@ release-next-one-only:
- for driver_version in ${DRIVER_VERSION}; do release_fn ${driver_version} ${OVERWRITE_REMOTE_TAGS} ${CI_COMMIT_SHORT_SHA}-; done
tags:
- fcos-next
except:
- /fedora/
rules:
# Only run on branches (not tags) which do not match ^fedora
- if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null

# Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies.
# https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2
Expand Down
4 changes: 3 additions & 1 deletion fedora/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ SHELL ["/bin/bash", "-c"]

RUN dnf install -y git wget

ENV GOLANG_VERSION=1.21.5
ENV GOLANG_VERSION=1.22.2

# download appropriate binary based on the target architecture for multi-arch builds
RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \
Expand Down Expand Up @@ -63,6 +63,8 @@ ENV NVIDIA_VISIBLE_DEVICES=void
# getopt etc.
RUN dnf install -y util-linux 'dnf-command(download)'

RUN dnf install -y patch

ADD install.sh /tmp/

RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \
Expand Down
125 changes: 78 additions & 47 deletions fedora/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Currently built driver versions are specified in `ci/fedora/.common-ci-fcos.yml`
The driver container is privileged, and here we choose to launch via podman instead of docker although both work.

```bash
$ DRIVER_VERSION=535.104.12 # Check ci/fedora/.common-ci-fcos.yml for latest
$ DRIVER_VERSION=550.90.07 # Check ci/fedora/.common-ci-fcos.yml for latest driver versions
$ FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2)
$ podman run -d --privileged --pid=host \
-v /run/nvidia:/run/nvidia:shared \
Expand All @@ -36,13 +36,14 @@ $ podman run -d --privileged --pid=host \
registry.gitlab.com/container-toolkit-fcos/driver:${DRIVER_VERSION}-fedora$$FEDORA_VERSION_ID
```

Or, on FCOS registering as a systemd unit via an ignition snippet, and using an image with kernel headers pre-installed for faster start up:
Or, on FCOS registering as a systemd unit via an ignition snippet. In this unit we attempt to pull a driver image matching the running kernel version (with pre-compiled kernel headers), but fall back to a generic Fedora version if one does not exist. Furthermore, we
mount a single patch file from a host directory that, if detected, will be applied to the generic Fedora version.

```yaml
variant: fcos
version: 1.4.0
storage:
files:
version: 1.5.0
systemd:
units:
- name: acme-nvidia-driver.service
enabled: true
contents: |
Expand All @@ -57,18 +58,32 @@ storage:
ExecStartPre=-/bin/podman rm nvidia-driver
ExecStartPre=-setenforce 0
ExecStartPre=-/bin/mkdir -p /run/nvidia
ExecStartPre=-/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \
/bin/podman pull registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID'
# 5/17/24 - Without the following line the nvidia driver container will crash with no meaningful error message
ExecStartPre=-/usr/sbin/modprobe video
ExecStart=/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \
/bin/podman run --name nvidia-driver \
-v /run/nvidia:/run/nvidia:shared \
-v /var/log:/var/log \
--privileged --pid=host \
# No need for network IF using container image with pre-built kernel headers \
--network=none \
registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID \
--accept-license'

# If there is a kernel-specific image (with pre-compiled kernel headers) then
# use it, otherwise fallback to the generic Fedora image mounting any patches that exist.
#
# Replace registry.gitlab.com/container-toolkit-fcos/driver with the registry name
# of your built/published driver images, or perhaps, docker.io/fifofonix/driver
ExecStart=/bin/sh -c ' \
FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \
KERNEL_VERSION=$(/bin/uname -r); \
if /bin/podman manifest inspect registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID > /dev/null; then \
IMAGE_NAME=registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID; \
else \
IMAGE_NAME=registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-fedora$$FEDORA_VERSION_ID; \
PATCH_MOUNT="-v /var/acme/nvidia-driver-patch:/patch"; \
fi; \
/bin/podman pull $$IMAGE_NAME; \
/bin/podman run --name nvidia-driver \
-v /run/nvidia:/run/nvidia:shared \
-v /var/log:/var/log \
$$PATCH_MOUNT \
--privileged \
--pid host \
$$IMAGE_NAME \
--accept-license'

ExecStop=/bin/podman stop nvidia-driver
Restart=on-failure
Expand All @@ -84,47 +99,64 @@ You should be able to step into the driver container and run the `nvidia-smi` to

```bash
$ # Assumes you're running the driver container via podman and named nvidia-driver as above...
$ podman exec -it nvidia-driver bash
[root@8dc88dad905e nvidia-510.47.03]# nvidia-smi
Wed May 25 15:24:00 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
| 0% 39C P0 197W / 300W | 22022MiB / 23028MiB | 96% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
[root@8dc88dad905e]#
$ podman exec -it nvidia-driver sh
sh-5.2# nvidia-smi
Tue Jun 11 19:55:25 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 Tesla M60 On | 00000000:00:1E.0 Off | 0 |
| N/A 47C P0 46W / 150W | 7131MiB / 7680MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
```

### Install Container Runtime / Toolkit

To run a CUDA container that leverages the NVIDIA driver container you now have running, install the separate NVIDIA container runtime and register it with your container runtime system (e.g. docker) following NVIDIA's instructions [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).

On FedoraCoreOS you may choose to layer the container toolkit using `rpm-ostree`, and configure your runtime, with an ignition snippet like this (substitute your runtime, docker is shown, but containerd works too for example):
On FedoraCoreOS you may choose to layer the container toolkit using `rpm-ostree`, and configure your runtime, with an ignition snippet like this (substitute your runtime, containerd is shown, but docker works too for example):

```yaml
variant: fcos
version: 1.4.0
version: 1.5.0
storage:
files:
- name: acme-layer-nvidia-container-runtime.service
- path: /etc/nvidia-container-runtime/config.toml
mode: 0644
contents:
inline: |
[nvidia-container-cli]
#debug = "/var/log/nvidia-container-toolkit.log"
root = "/run/nvidia/driver"
path = "/usr/bin/nvidia-container-cli"
# Improvements made in NVIDIA container toolkit 1.15.0 do not yet seem to correctly
# support FCOS so we still need to explicitly add the driver path to ld.so.conf
- path: /etc/ld.so.conf.d/container-toolkit.conf
mode: 0644
contents:
inline: |
/run/nvidia/driver/usr/lib64
systemd:
units:
- name: acme-layer-nvidia-container-toolkit.service
enabled: true
# We run before `zincati.service` to avoid conflicting rpm-ostree transactions.
contents: |
[Unit]
Wants=network-online.target
After=network-online.target
Before=zincati.service
ConditionPathExists=!/var/lib/%N.stamp
Expand All @@ -137,13 +169,12 @@ storage:
ExecStartPre=-/bin/sh -c 'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
> /etc/yum.repos.d/nvidia-container-toolkit.repo'
# Perhaps consider pinning the rpm version here depending on change aversion...
ExecStart=/usr/bin/rpm-ostree install --idempotent --allow-inactive --apply-live nvidia-container-toolkit
ExecStart=/bin/sh -c 'echo "/run/nvidia/driver/usr/lib64" > /etc/ld.so.conf.d/nv.conf; ldconfig'
# If we see that the nvidia-ctk is present, then we can configure docker...
ExecStart=/usr/bin/rpm-ostree install -y --idempotent --allow-inactive nvidia-container-toolkit
ExecStart=/bin/sh -c 'if [[ -f /usr/bin/nvidia-ctk ]]; then \
/usr/bin/nvidia-ctk runtime configure --runtime=docker --nvidia-set-as-default; \
systemctl restart docker; \
/usr/bin/nvidia-ctk runtime configure --runtime=containerd --nvidia-set-as-default; \
systemctl restart containerd; \
/bin/touch /var/lib/%N.stamp; fi'
ExecStart=/bin/systemctl --no-block reboot
Restart=on-failure
RestartSec=60

Expand Down
31 changes: 25 additions & 6 deletions fedora/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ _install_prerequisites() (
rm ./*.rpm

echo "Installing Linux kernel-modules-core files..."
if ! dnf -q -y download kernel-modules-core${KERNEL_VERSION} > /dev/null; then
if ! dnf -q -y download kernel-modules-core-${KERNEL_VERSION} > /dev/null; then
echo "Failed to find kernel-modules-core-${KERNEL_VERSION} in repositories."
echo "Trying to download kernel-modules-core from koji..."
KOJI_KERNEL_CORE_RPM=$KOJI_BASE_URL/packages/kernel/$KERNEL_RPM_VERSION/$KERNEL_RPM_RELEASE/$KERNEL_RPM_ARCH/kernel-modules-core-$KERNEL_VERSION.rpm
Expand Down Expand Up @@ -240,9 +240,9 @@ _create_driver_package() (
# lrwxrwxrwx 1 root root 36 Dec 8 20:10 default -> /etc/alternatives/ofa_kernel_headers
# drwxr-xr-x 4 root root 4096 Dec 8 20:14 x86_64
# lrwxrwxrwx 1 root root 44 Dec 9 19:05 5.4.0-90-generic -> /usr/src/ofa_kernel/x86_64/5.4.0-90-generic/
if [[ -d /run/mellanox/drivers/usr/src/ofa_kernel/x86_64/`uname -r` ]]; then
if [[ ! -e /usr/src/ofa_kernel/`uname -r` ]]; then
ln -s /run/mellanox/drivers/usr/src/ofa_kernel/x86_64/`uname -r` /usr/src/ofa_kernel/
if [[ -d "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" ]]; then
if [[ ! -e "/usr/src/ofa_kernel/$(uname -r)" ]]; then
ln -s "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" /usr/src/ofa_kernel/
fi
fi
fi
Expand Down Expand Up @@ -595,14 +595,33 @@ _start_vgpu_topology_daemon() {
nvidia-topologyd
}

# Apply a single user-supplied *.patch file to the NVIDIA .run installer.
#
# Globals:   DRIVER_ARCH, DRIVER_VERSION, KERNEL_TYPE (read)
#            PATCHED_SUFFIX (written; set to "-custom" only after the
#            installer has been patched successfully)
# Arguments: $1 - optional directory holding the patch (default: /patch)
# Outputs:   progress message on stdout, errors on stderr
# Returns:   0 when there is nothing to do or the patch was applied;
#            exits 1 when more than one patch is present or patching fails
_apply_patch () {
    local patch_dir=${1:-/patch}
    local installer="NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run"
    local -a patches=()
    local candidate

    # No patch directory mounted: nothing to do.
    if [ ! -d "${patch_dir}" ]; then
        return 0
    fi

    # Collect patches with a glob rather than parsing 'ls'. An unmatched glob
    # is left as the literal pattern, so keep only entries that really exist.
    for candidate in "${patch_dir}"/*.patch; do
        if [ -e "${candidate}" ]; then
            patches+=("${candidate}")
        fi
    done

    # Directory present but empty: keep using the unmodified installer.
    if [ "${#patches[@]}" -eq 0 ]; then
        return 0
    fi

    # Exit if multiple patches are found
    if [ "${#patches[@]}" -gt 1 ]; then
        echo "Multiple patches found, only one patch is supported" >&2
        exit 1
    fi

    # Fail loudly here instead of letting the caller look for a '-custom.run'
    # installer that was never produced.
    if ! sh "${installer}" --apply-patch "${patches[0]}" -m="${KERNEL_TYPE}"; then
        echo "Failed to apply patch '${patches[0]}' to ${installer}" >&2
        exit 1
    fi

    # The patched installer has by default the file name ending '-custom.run'
    PATCHED_SUFFIX="-custom"
    echo -e "NVIDIA Software installer patched with '${patches[0]}'\n"
}

_prepare() {
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
fi

_apply_patch

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x -m=${KERNEL_TYPE} && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION${PATCHED_SUFFIX:-}.run -x -m=${KERNEL_TYPE} && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION${PATCHED_SUFFIX:-} && \
sh /tmp/install.sh nvinstall && \
mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \
mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \
Expand Down