Skip to content

Commit

Permalink
Merge pull request #181 from yeazelm/470_to_535
Browse files Browse the repository at this point in the history
kmod-5.10-nvidia: move to R535 branch from R470
  • Loading branch information
yeazelm authored Oct 11, 2024
2 parents b00f0d6 + d36b035 commit e781c61
Show file tree
Hide file tree
Showing 16 changed files with 499 additions and 182 deletions.
2 changes: 2 additions & 0 deletions packages/kmod-5.10-nvidia/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
NVidiaEULAforAWS.pdf
COPYING
*.rpm
23 changes: 19 additions & 4 deletions packages/kmod-5.10-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,28 @@ url = "https://s3.amazonaws.com/EULA/NVidiaEULAforAWS.pdf"
sha512 = "e1926fe99afc3ab5b2f2744fcd53b4046465aefb2793e2e06c4a19455a3fde895e00af1415ff1a5804c32e6a2ed0657e475de63da6c23a0e9c59feeef52f3f58"

[[package.metadata.build-package.external-files]]
url = "https://us.download.nvidia.com/tesla/470.256.02/NVIDIA-Linux-x86_64-470.256.02.run"
sha512 = "a837946dd24d7945c1962a695f1f31965f3ceb6927f52cd08fd51b8db138b7a888bbeab69243f5c8468a7bd7ccd47f5dbdb48a1ca81264866c1ebb7d88628f88"
url = "https://us.download.nvidia.com/tesla/535.183.06/NVIDIA-Linux-x86_64-535.183.06.run"
sha512 = "424950ef303ea39499e96f8c90c1e0c83aee12309779d4f335769ef554ad4f7c38e98f69c64b408adc85a7cf51ea600d85222792402b9c6b7941f1af066d2a33"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://us.download.nvidia.com/tesla/470.256.02/NVIDIA-Linux-aarch64-470.256.02.run"
sha512 = "38eee5933355c34ca816a2ac0fbc4f55c19c20e1322891bfc98cb6b37d99a31218eea9314877ab0e3cf3ac6eb61f9d9d4d09d0af304b689f18b4efa721b65d5c"
url = "https://us.download.nvidia.com/tesla/535.183.06/NVIDIA-Linux-aarch64-535.183.06.run"
sha512 = "bb305f1703557461b0a0a29066c304658d9684841104c6f4d9ff44f9db90fee14ae619cd2fe3242823a5fe3a69b168b8174b163740014b15cdef36db88ba2d96"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/nvidia-fabric-manager-535.183.06-1.x86_64.rpm"
sha512 = "c3d98878363f857b2963665a0e485cb7b1afeaabd0040a970478d00ffb870ab4130ab9dfe1b7a40d1b38734636ebccec39fd1b3fc8c06abc5c07470f749b6025"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/sbsa/nvidia-fabric-manager-535.183.06-1.aarch64.rpm"
sha512 = "6a646cd7ea11e668f7dbe6f6bb22516107a856e3c3755f8693c91d4bed706b8b3667b853f07e84c2d0da4de7ab1107337b6a1493879d75d8c201bfe9da071b32"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://raw.githubusercontent.com/NVIDIA/open-gpu-kernel-modules/535/COPYING"
sha512 = "f9cee68cbb12095af4b4e92d01c210461789ef41c70b64efefd6719d0b88468b7a67a3629c432d4d9304c730b5d1a942228a5bcc74a03ab1c411c77c758cd938"
force-upstream = true

[build-dependencies]
Expand Down
20 changes: 20 additions & 0 deletions packages/kmod-5.10-nvidia/copy-open-gpu-kernel-modules.service.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[Unit]
Description=Copy open GPU kernel modules
# Do not start until the filesystems holding these paths are mounted.
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Rerunning this service after the system is fully loaded will override
# the already linked kernel modules. This doesn't affect the running system,
# since kernel modules are linked early in the boot sequence, but we still
# disable manual restarts to prevent unnecessary kernel module rewrites.
RefuseManualStart=true
RefuseManualStop=true

[Service]
# oneshot permits more than one ExecStart= line; the commands run in the
# order they are listed.
Type=oneshot
# Gate: the ExecStart= commands below run only if this check succeeds.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
# Two module sets are processed: the main open GPU set, then the copy-only set.
ExecStart=/usr/bin/driverdog --modules-set nvidia-open-gpu link-modules
ExecStart=/usr/bin/driverdog --modules-set nvidia-open-gpu-copy-only link-modules
# Stay "active" after the commands exit so that units requiring this one
# remain satisfied.
RemainAfterExit=true
StandardError=journal+console

[Install]
# Enabling this unit makes it a hard requirement of preconfigured.target.
RequiredBy=preconfigured.target
520 changes: 352 additions & 168 deletions packages/kmod-5.10-nvidia/kmod-5.10-nvidia.spec

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[Unit]
Description=Link additional kernel modules
Description=Link Tesla kernel modules
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Rerunning this service after the system is fully loaded will override
# the already linked kernel modules. This doesn't affect the running system,
Expand All @@ -10,7 +10,8 @@ RefuseManualStop=true

[Service]
Type=oneshot
ExecStart=/usr/bin/driverdog link-modules
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
ExecStart=/usr/bin/driverdog --modules-set nvidia-tesla link-modules
RemainAfterExit=true
StandardError=journal+console

Expand Down
19 changes: 19 additions & 0 deletions packages/kmod-5.10-nvidia/load-open-gpu-kernel-modules.service.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[Unit]
Description=Load open GPU kernel modules
# Do not start until the filesystems holding these paths are mounted.
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Run strictly after the copy unit, and fail to start if that unit fails.
After=copy-open-gpu-kernel-modules.service
Requires=copy-open-gpu-kernel-modules.service
# Disable manual restarts to prevent loading kernel modules
# that weren't linked by the running system
RefuseManualStart=true
RefuseManualStop=true

[Service]
Type=oneshot
# Gate: the ExecStart= command below runs only if this check succeeds.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
# Only the main module set is passed to load-modules; the copy-only set is
# not loaded by this unit.
ExecStart=/usr/bin/driverdog --modules-set nvidia-open-gpu load-modules
# Stay "active" after the command exits so that units ordered after this one
# see it as started.
RemainAfterExit=true
StandardError=journal+console

[Install]
# Enabling this unit makes it a hard requirement of preconfigured.target.
RequiredBy=preconfigured.target
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[Unit]
Description=Load additional kernel modules
Description=Load Tesla kernel modules
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
After=link-tesla-kernel-modules.service
Requires=link-tesla-kernel-modules.service
Expand All @@ -10,7 +10,8 @@ RefuseManualStop=true

[Service]
Type=oneshot
ExecStart=/usr/bin/driverdog load-modules
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
ExecStart=/usr/bin/driverdog --modules-set nvidia-tesla load-modules
RemainAfterExit=true
StandardError=journal+console

Expand Down
34 changes: 34 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-fabricmanager.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Modern, systemd-aware settings:
# - Log to journal via stderr
# - Keep running in the foreground
# NOTE(review): LOG_LEVEL=4 is presumably the "info" level; verify against the
# Fabric Manager user guide.
LOG_LEVEL=4
# Intentionally empty: no log file is configured.
LOG_FILE_NAME=
DAEMONIZE=0

# Use Unix domain sockets instead of localhost ports.
# Both sockets are placed under /run/nvidia; presumably that directory has to
# exist before the daemon starts. TODO confirm.
UNIX_SOCKET_PATH=/run/nvidia/fabricmanager.sock
FM_CMD_UNIX_SOCKET_PATH=/run/nvidia/fabricmanager-cmd.sock

# Start Fabric Manager in bare metal or full pass through virtualization mode.
FABRIC_MODE=0
FABRIC_MODE_RESTART=0

# Terminate on NVSwitch and GPU config failure.
FM_STAY_RESIDENT_ON_FAILURES=0

# When there is a GPU to NVSwitch NVLink failure, remove the GPU with the failure
# from NVLink P2P capability.
ACCESS_LINK_FAILURE_MODE=0

# When there is an NVSwitch to NVSwitch NVLink failure, exit Fabric Manager.
TRUNK_LINK_FAILURE_MODE=0

# When there is an NVSwitch failure or an NVSwitch is excluded, abort Fabric Manager.
NVSWITCH_FAILURE_MODE=0

# When Fabric Manager service is stopped or terminated, abort all running CUDA jobs.
ABORT_CUDA_JOBS_ON_FM_EXIT=1

# Path to topology and database files.
# Both settings point at the same directory.
TOPOLOGY_FILE_PATH=/usr/share/nvidia/tesla/nvswitch
DATABASE_PATH=/usr/share/nvidia/tesla/nvswitch
16 changes: 16 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-fabricmanager.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[Unit]
Description=NVIDIA fabric manager service
# NOTE(review): this unit declares no After=/Requires= on the units that load
# the NVIDIA kernel modules. Confirm that start-up ordering is handled
# elsewhere.

[Service]
# Start the daemon with an explicit configuration file.
ExecStart=/usr/libexec/nvidia/tesla/bin/nv-fabricmanager -c /etc/nvidia/fabricmanager.cfg
# Type=simple expects the process to stay in the foreground; presumably the
# configuration file passed above disables daemonizing. TODO confirm.
Type=simple
# NOTE(review): 0 is the legacy spelling of "no start/stop timeout"; verify
# against the systemd version in use.
TimeoutSec=0
# Restart on every exit, waiting 5 seconds between attempts.
RestartSec=5
Restart=always
# NOTE(review): RemainAfterExit= combined with Restart=always is unusual for a
# long-running daemon. Confirm that it is intentional.
RemainAfterExit=true
StandardError=journal+console
# Exit status 255 is treated as a successful exit, in addition to 0.
SuccessExitStatus=255
# No size limit on core dumps.
LimitCORE=infinity

[Install]
WantedBy=multi-user.target
2 changes: 1 addition & 1 deletion packages/kmod-5.10-nvidia/nvidia-ld.so.conf.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__LIBDIR__/nvidia/tesla/__NVIDIA_VERSION__/
__LIBDIR__/nvidia/tesla/
11 changes: 11 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-open-gpu-config.toml.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Module set "nvidia-open-gpu". The table name is the value passed to
# driverdog via --modules-set.
[nvidia-open-gpu]
# Destination directory for the modules. Presumably relative to the running
# kernel's modules directory; verify against driverdog.
lib-modules-path = "kernel/drivers/extra/video/nvidia/open-gpu"

# One table per kernel module. Every copy-source value in this template is a
# placeholder that is expected to be substituted at build time.
[nvidia-open-gpu.kernel-modules."nvidia.ko"]
copy-source = "__NVIDIA_MODULES__"

[nvidia-open-gpu.kernel-modules."nvidia-modeset.ko"]
copy-source = "__NVIDIA_MODULES__"

[nvidia-open-gpu.kernel-modules."nvidia-uvm.ko"]
copy-source = "__NVIDIA_MODULES__"
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Module set "nvidia-open-gpu-copy-only". It shares its destination directory
# with the "nvidia-open-gpu" set but is kept as a separate set so it can be
# passed to driverdog on its own via --modules-set.
[nvidia-open-gpu-copy-only]
lib-modules-path = "kernel/drivers/extra/video/nvidia/open-gpu"

# One table per kernel module. Every copy-source value in this template is a
# placeholder that is expected to be substituted at build time.
[nvidia-open-gpu-copy-only.kernel-modules."nvidia-drm.ko"]
copy-source = "__NVIDIA_MODULES__"

[nvidia-open-gpu-copy-only.kernel-modules."nvidia-peermem.ko"]
copy-source = "__NVIDIA_MODULES__"
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ After=load-tesla-kernel-modules.service load-open-gpu-kernel-modules.service

[Service]
Type=forking
ExecStart=__NVIDIA_BINDIR__/nvidia-persistenced --user nvidia --verbose
ExecStart=/usr/libexec/nvidia/tesla/bin/nvidia-persistenced --user nvidia --verbose

[Install]
RequiredBy=preconfigured.target
5 changes: 5 additions & 0 deletions packages/kmod-5.10-nvidia/nvidia-tesla-tmpfiles.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Entries of type "C" copy a source file to the given path if that path does
# not exist yet. When no source argument is given, the file with the same
# path under /usr/share/factory is used as the source.
C /etc/drivers/nvidia-tesla.toml
C /etc/drivers/nvidia-open-gpu.toml
C /etc/drivers/nvidia-open-gpu-copy-only.toml
# Explicit source: the factory copy is stored under a different name.
C /etc/containerd/nvidia.env - - - - /usr/share/factory/nvidia/tesla/nvidia-path.env
C /etc/ld.so.conf.d/nvidia-tesla.conf
3 changes: 0 additions & 3 deletions packages/kmod-5.10-nvidia/nvidia-tesla-tmpfiles.conf.in

This file was deleted.

6 changes: 5 additions & 1 deletion packages/kmod-5.10-nvidia/nvidia-tmpfiles.conf.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
R __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/tesla - - - - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/tesla 0755 root root - -
D /var/run/nvidia-persistenced 0755 nvidia nvidia - -
R __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/open-gpu - - - - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/open-gpu 0755 root root - -
C /etc/nvidia/fabricmanager.cfg - - - -
d /run/nvidia 0700 root root -
D /var/run/nvidia-persistenced 0755 nvidia nvidia - -

0 comments on commit e781c61

Please sign in to comment.