From c55597216125b2299704f689961659b099fb5421 Mon Sep 17 00:00:00 2001 From: Soumya Pani Date: Mon, 6 Nov 2023 09:43:41 -0800 Subject: [PATCH 1/2] Updating RxDM and nccl-plugin to LKG versions. --- .../templates/aiinfra_startup_scripts.yaml.template | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/a3/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_startup_scripts.yaml.template b/a3/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_startup_scripts.yaml.template index 09fac748d..0897b7df0 100644 --- a/a3/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_startup_scripts.yaml.template +++ b/a3/terraform/modules/cluster/mig-cos/cloudinit/templates/aiinfra_startup_scripts.yaml.template @@ -30,6 +30,7 @@ docker run --pull=always --rm \ --name receive-datapath-manager \ --detach \ + --privileged \ --cap-add=NET_ADMIN --network=host \ --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \ $${device_flags} \ @@ -38,7 +39,7 @@ --env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \ --volume /run/tcpx:/run/tcpx \ --entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd \ - us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd \ + us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.7 \ --gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" echo 'Installing iptable rules...' @@ -47,7 +48,7 @@ echo 'Configuring NCCL and GPUDirectTCPX plugin...' docker run --rm \ --volume /var/lib:/var/lib \ - us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx \ + us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.6_2023_10_06 \ install --install-nccl mount --bind /var/lib/tcpx /var/lib/tcpx mount -o remount,exec /var/lib/tcpx From ceee1381a84cce6097944561ee5345be38f12d66 Mon Sep 17 00:00:00 2001 From: Soumya Pani Date: Mon, 6 Nov 2023 11:44:10 -0800 Subject: [PATCH 2/2] Updating release version number. --- cloudbuild-continuous.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild-continuous.yaml b/cloudbuild-continuous.yaml index 436b5170f..0f9472892 100644 --- a/cloudbuild-continuous.yaml +++ b/cloudbuild-continuous.yaml @@ -46,4 +46,4 @@ images: timeout: 5400s substitutions: - _VERSION: 'v1.4.0' + _VERSION: 'v1.4.1'