Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kmod-5.10-nvidia: move to R535 branch from R470 #181

Merged
merged 1 commit into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/kmod-5.10-nvidia/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
NVidiaEULAforAWS.pdf
COPYING
*.rpm
23 changes: 19 additions & 4 deletions packages/kmod-5.10-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,28 @@ url = "https://s3.amazonaws.com/EULA/NVidiaEULAforAWS.pdf"
sha512 = "e1926fe99afc3ab5b2f2744fcd53b4046465aefb2793e2e06c4a19455a3fde895e00af1415ff1a5804c32e6a2ed0657e475de63da6c23a0e9c59feeef52f3f58"

[[package.metadata.build-package.external-files]]
url = "https://us.download.nvidia.com/tesla/470.256.02/NVIDIA-Linux-x86_64-470.256.02.run"
sha512 = "a837946dd24d7945c1962a695f1f31965f3ceb6927f52cd08fd51b8db138b7a888bbeab69243f5c8468a7bd7ccd47f5dbdb48a1ca81264866c1ebb7d88628f88"
url = "https://us.download.nvidia.com/tesla/535.183.06/NVIDIA-Linux-x86_64-535.183.06.run"
sha512 = "424950ef303ea39499e96f8c90c1e0c83aee12309779d4f335769ef554ad4f7c38e98f69c64b408adc85a7cf51ea600d85222792402b9c6b7941f1af066d2a33"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://us.download.nvidia.com/tesla/470.256.02/NVIDIA-Linux-aarch64-470.256.02.run"
sha512 = "38eee5933355c34ca816a2ac0fbc4f55c19c20e1322891bfc98cb6b37d99a31218eea9314877ab0e3cf3ac6eb61f9d9d4d09d0af304b689f18b4efa721b65d5c"
url = "https://us.download.nvidia.com/tesla/535.183.06/NVIDIA-Linux-aarch64-535.183.06.run"
sha512 = "bb305f1703557461b0a0a29066c304658d9684841104c6f4d9ff44f9db90fee14ae619cd2fe3242823a5fe3a69b168b8174b163740014b15cdef36db88ba2d96"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/nvidia-fabric-manager-535.183.06-1.x86_64.rpm"
sha512 = "c3d98878363f857b2963665a0e485cb7b1afeaabd0040a970478d00ffb870ab4130ab9dfe1b7a40d1b38734636ebccec39fd1b3fc8c06abc5c07470f749b6025"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/sbsa/nvidia-fabric-manager-535.183.06-1.aarch64.rpm"
sha512 = "6a646cd7ea11e668f7dbe6f6bb22516107a856e3c3755f8693c91d4bed706b8b3667b853f07e84c2d0da4de7ab1107337b6a1493879d75d8c201bfe9da071b32"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://raw.githubusercontent.com/NVIDIA/open-gpu-kernel-modules/535/COPYING"
sha512 = "f9cee68cbb12095af4b4e92d01c210461789ef41c70b64efefd6719d0b88468b7a67a3629c432d4d9304c730b5d1a942228a5bcc74a03ab1c411c77c758cd938"
force-upstream = true

[build-dependencies]
Expand Down
20 changes: 20 additions & 0 deletions packages/kmod-5.10-nvidia/copy-open-gpu-kernel-modules.service.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Oneshot unit that runs `driverdog link-modules` for the two open GPU
# module sets (`nvidia-open-gpu` and `nvidia-open-gpu-copy-only`).
# NOTE(review): PREFIX looks like a placeholder that is substituted when this
# `.in` template is processed - confirm against the spec file.
[Unit]
Description=Copy open GPU kernel modules
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Rerunning this service after the system is fully loaded would overwrite
# the already linked kernel modules. That does not affect the running system,
# since kernel modules are linked early in the boot sequence, but manual
# starts and stops are still refused to prevent unnecessary rewrites of the
# kernel modules.
RefuseManualStart=true
RefuseManualStop=true

[Service]
Type=oneshot
# ExecCondition= gates the whole unit: when the command exits with a status
# from 1 through 254 the ExecStart= commands are skipped and the unit is not
# marked as failed.
# NOTE(review): presumably this succeeds only when the open GPU driver is the
# one that matches the present hardware - confirm against ghostdog.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
# A oneshot service may carry several ExecStart= lines; they run in the order
# written. Both module sets are passed to `link-modules` here.
ExecStart=/usr/bin/driverdog --modules-set nvidia-open-gpu link-modules
ExecStart=/usr/bin/driverdog --modules-set nvidia-open-gpu-copy-only link-modules
# Keep the unit in the active state after the commands have exited.
RemainAfterExit=true
StandardError=journal+console

[Install]
RequiredBy=preconfigured.target
520 changes: 352 additions & 168 deletions packages/kmod-5.10-nvidia/kmod-5.10-nvidia.spec

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[Unit]
Description=Link additional kernel modules
Description=Link Tesla kernel modules
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Rerunning this service after the system is fully loaded will override
# the already linked kernel modules. This doesn't affect the running system,
Expand All @@ -10,7 +10,8 @@ RefuseManualStop=true

[Service]
Type=oneshot
ExecStart=/usr/bin/driverdog link-modules
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
ExecStart=/usr/bin/driverdog --modules-set nvidia-tesla link-modules
RemainAfterExit=true
StandardError=journal+console

Expand Down
19 changes: 19 additions & 0 deletions packages/kmod-5.10-nvidia/load-open-gpu-kernel-modules.service.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Oneshot unit that runs `driverdog load-modules` for the `nvidia-open-gpu`
# module set. It is ordered after, and requires, the unit that copies the
# open GPU kernel modules.
[Unit]
Description=Load open GPU kernel modules
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# After= only orders the two units; Requires= additionally pulls the copy
# unit in and ties this unit's activation to it.
After=copy-open-gpu-kernel-modules.service
Requires=copy-open-gpu-kernel-modules.service
# Disable manual restarts to prevent loading kernel modules
# that weren't linked by the running system
RefuseManualStart=true
RefuseManualStop=true

[Service]
Type=oneshot
# ExecCondition= gates the unit: an exit status from 1 through 254 skips
# ExecStart= without marking the unit as failed.
# NOTE(review): presumably this succeeds only when the open GPU driver is the
# one that matches the present hardware - confirm against ghostdog.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
# Only the `nvidia-open-gpu` set is loaded here; the
# `nvidia-open-gpu-copy-only` set is not passed to `load-modules`.
ExecStart=/usr/bin/driverdog --modules-set nvidia-open-gpu load-modules
# Keep the unit in the active state after the command has exited.
RemainAfterExit=true
StandardError=journal+console

[Install]
RequiredBy=preconfigured.target
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[Unit]
Description=Load additional kernel modules
Description=Load Tesla kernel modules
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
After=link-tesla-kernel-modules.service
Requires=link-tesla-kernel-modules.service
Expand All @@ -10,7 +10,8 @@ RefuseManualStop=true

[Service]
Type=oneshot
ExecStart=/usr/bin/driverdog load-modules
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
ExecStart=/usr/bin/driverdog --modules-set nvidia-tesla load-modules
RemainAfterExit=true
StandardError=journal+console

Expand Down
34 changes: 34 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-fabricmanager.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Configuration for the NVIDIA Fabric Manager daemon. The accompanying
# nvidia-fabricmanager.service passes this file to nv-fabricmanager with `-c`.

# Modern, systemd-aware settings:
# - Log to journal via stderr
# - Keep running in the foreground
# NOTE(review): per the NVIDIA Fabric Manager user guide, LOG_LEVEL=4 selects
# INFO and above - confirm for the packaged release.
LOG_LEVEL=4
# Left empty on purpose so that no log file is used (see the note above).
LOG_FILE_NAME=
# 0 keeps the process in the foreground, which is what the service unit's
# Type=simple expects.
DAEMONIZE=0

# Use Unix domain sockets instead of localhost ports.
# Both sockets live under /run/nvidia, which nvidia-tmpfiles.conf.in creates
# with mode 0700 and owner root:root.
UNIX_SOCKET_PATH=/run/nvidia/fabricmanager.sock
FM_CMD_UNIX_SOCKET_PATH=/run/nvidia/fabricmanager-cmd.sock

# Start Fabric Manager in bare metal or full pass through virtualization mode.
FABRIC_MODE=0
FABRIC_MODE_RESTART=0

# Terminate on NVSwitch and GPU config failure.
FM_STAY_RESIDENT_ON_FAILURES=0

# When there is a GPU to NVSwitch NVLink failure, remove the GPU with the failure
# from NVLink P2P capability.
ACCESS_LINK_FAILURE_MODE=0

# When there is an NVSwitch to NVSwitch NVLink failure, exit Fabric Manager.
TRUNK_LINK_FAILURE_MODE=0

# When there is an NVSwitch failure or an NVSwitch is excluded, abort Fabric Manager.
NVSWITCH_FAILURE_MODE=0

# When Fabric Manager service is stopped or terminated, abort all running CUDA jobs.
ABORT_CUDA_JOBS_ON_FM_EXIT=1

# Path to topology and database files.
# Both settings point at the same directory.
TOPOLOGY_FILE_PATH=/usr/share/nvidia/tesla/nvswitch
DATABASE_PATH=/usr/share/nvidia/tesla/nvswitch
16 changes: 16 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-fabricmanager.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Runs the NVIDIA Fabric Manager daemon in the foreground under systemd.
[Unit]
Description=NVIDIA fabric manager service

[Service]
# The configuration file is populated at /etc/nvidia/fabricmanager.cfg by a
# `C` entry in nvidia-tmpfiles.conf.in.
ExecStart=/usr/libexec/nvidia/tesla/bin/nv-fabricmanager -c /etc/nvidia/fabricmanager.cfg
# Type=simple matches DAEMONIZE=0 in the configuration file: the process
# started by ExecStart= is the main service process.
Type=simple
# TimeoutSec= sets both the start and the stop timeout; 0 disables them.
TimeoutSec=0
# Restart the daemon 5 seconds after it exits, whatever the exit reason.
RestartSec=5
Restart=always
# NOTE(review): RemainAfterExit=true keeps the unit active after the main
# process exits; confirm this is intended together with Restart=always.
RemainAfterExit=true
StandardError=journal+console
# Treat exit status 255 as a successful termination.
SuccessExitStatus=255
# Do not limit the size of core dumps.
LimitCORE=infinity

[Install]
WantedBy=multi-user.target
2 changes: 1 addition & 1 deletion packages/kmod-5.10-nvidia/nvidia-ld.so.conf.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__LIBDIR__/nvidia/tesla/__NVIDIA_VERSION__/
__LIBDIR__/nvidia/tesla/
11 changes: 11 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-open-gpu-config.toml.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Module set `nvidia-open-gpu`. The open GPU copy and load service units pass
# this name to `driverdog --modules-set` for both `link-modules` and
# `load-modules`.
[nvidia-open-gpu]
# Same relative path as the `open-gpu` directory that nvidia-tmpfiles.conf.in
# creates under __PREFIX__/lib/modules/__KERNEL_VERSION__/.
lib-modules-path = "kernel/drivers/extra/video/nvidia/open-gpu"

# One sub-table per kernel module file; each one is copied from the same
# source location.
# NOTE(review): __NVIDIA_MODULES__ looks like a placeholder substituted when
# this `.in` template is processed - confirm against the spec file.
[nvidia-open-gpu.kernel-modules."nvidia.ko"]
copy-source = "__NVIDIA_MODULES__"

[nvidia-open-gpu.kernel-modules."nvidia-modeset.ko"]
copy-source = "__NVIDIA_MODULES__"

[nvidia-open-gpu.kernel-modules."nvidia-uvm.ko"]
copy-source = "__NVIDIA_MODULES__"
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Module set `nvidia-open-gpu-copy-only`. The open GPU copy service unit
# passes this name to `driverdog --modules-set` for `link-modules` only; the
# open GPU load unit does not reference this set.
[nvidia-open-gpu-copy-only]
# Same destination directory as the `nvidia-open-gpu` module set.
lib-modules-path = "kernel/drivers/extra/video/nvidia/open-gpu"

# NOTE(review): __NVIDIA_MODULES__ looks like a placeholder substituted when
# this `.in` template is processed - confirm against the spec file.
[nvidia-open-gpu-copy-only.kernel-modules."nvidia-drm.ko"]
copy-source = "__NVIDIA_MODULES__"

[nvidia-open-gpu-copy-only.kernel-modules."nvidia-peermem.ko"]
copy-source = "__NVIDIA_MODULES__"
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ After=load-tesla-kernel-modules.service load-open-gpu-kernel-modules.service

[Service]
Type=forking
ExecStart=__NVIDIA_BINDIR__/nvidia-persistenced --user nvidia --verbose
ExecStart=/usr/libexec/nvidia/tesla/bin/nvidia-persistenced --user nvidia --verbose

[Install]
RequiredBy=preconfigured.target
5 changes: 5 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-tesla-tmpfiles.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# tmpfiles.d entries that populate NVIDIA configuration files under /etc.
# `C` copies a file or directory when the destination does not exist yet.
# With no source argument, the source is the same path under
# /usr/share/factory/.
C /etc/drivers/nvidia-tesla.toml
C /etc/drivers/nvidia-open-gpu.toml
C /etc/drivers/nvidia-open-gpu-copy-only.toml
# This entry names its source explicitly because the factory file has a
# different name and location than the destination.
C /etc/containerd/nvidia.env - - - - /usr/share/factory/nvidia/tesla/nvidia-path.env
C /etc/ld.so.conf.d/nvidia-tesla.conf
3 changes: 0 additions & 3 deletions packages/kmod-5.10-nvidia/nvidia-tesla-tmpfiles.conf.in

This file was deleted.

6 changes: 5 additions & 1 deletion packages/kmod-5.10-nvidia/nvidia-tmpfiles.conf.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
R __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/tesla - - - - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/tesla 0755 root root - -
D /var/run/nvidia-persistenced 0755 nvidia nvidia - -
R __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/open-gpu - - - - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/open-gpu 0755 root root - -
C /etc/nvidia/fabricmanager.cfg - - - -
d /run/nvidia 0700 root root -
D /var/run/nvidia-persistenced 0755 nvidia nvidia - -