Adding a v6e network optimization daemonset to ai-on-gke repository. (#…

…805)
GoogleCloudPlatform · Sep 13, 2024 · 333765c · 333765c
1 parent daf6d32
commit 333765c
Showing 1 changed file with 172 additions and 0 deletions.
diff --git a/scripts/network-setup/v6e-network-optimization.yaml b/scripts/network-setup/v6e-network-optimization.yaml
@@ -0,0 +1,172 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: tpu-network-optimization
+  namespace: kube-system
+  labels:
+    k8s-app: tpu-network-optimization
+spec:
+  selector:
+    matchLabels:
+      k8s-app: tpu-network-optimization
+  template:
+    metadata:
+      labels:
+        k8s-app: tpu-network-optimization
+    spec:
+      priorityClassName: system-node-critical
+      # hostNetwork: true prevents a pod IP from being allocated to this pod, which can help with IP space utilization.
+      hostNetwork: true
+      hostPID: true
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: cloud.google.com/gke-tpu-accelerator
+                operator: In
+                values:
+                - tpu-v6e-slice
+      tolerations:
+      - operator: "Exists"
+        effect: "NoExecute"
+      - operator: "Exists"
+        effect: "NoSchedule"
+      initContainers:
+      - name: "tpu-network-optimization"
+        image: "ubuntu:latest"
+        securityContext:
+          privileged: true
+        command:
+        - bash
+        - -c
+        - |
+          #!/bin/bash
+
+          # returns 0 (success) if it's running on a v6e VM.
+          is_v6etpu_platform() {
+            local machine_type
+            machine_type=$(curl -H "Metadata-Flavor: Google" \
+              http://169.254.169.254/computeMetadata/v1/instance/machine-type)
+
+            echo "machine_type: $machine_type"
+            # Non-v6 TPUs are exempt
+            [[ "$machine_type" == *"ct6e"* ]] || return 1
+
+            return 0
+          }
+
+          if ! is_v6etpu_platform; then
+            echo "Not a v6e TPU platform"
+            exit 0
+          fi
+
+          echo "Running on a v6e TPU platform"
+          # This must be a v6e platform. Continue with v6e-specific network tunings.
+
+          # PART 1: IRQ SPREADING. If this VM has multiple vnics, we need to make sure
+          # they're using different sets of cores for interrupt handling.
+
+          # Used to wrap around to the first core if we run out of cores. We limit
+          # ourselves to node 0, and avoid hyperbuddies.
+          node0_cores=$(echo /sys/devices/system/node/node0/cpu[0-9]* | wc -w)
+          ht_buddies=$(cat /sys/devices/system/cpu/cpu0/topology/core_cpus_list | tr ',' ' ' | wc -w)
+          total_schedulable_cores=$((node0_cores / ht_buddies))
+
+          core=0
+          for nic in $(ls -1 /sys/class/net);
+          do
+            echo "Updating interrupt cores for $nic"
+            if [[ -d "/sys/class/net/$nic/device" ]]; then
+              # ASSUMPTION: There are an equal number of TX and RX queues.
+              NUM_QUEUES=$(echo /sys/class/net/"$nic"/queues/tx* | wc -w)
+              # Helpers to figure out where to write the soft IRQ affinities. See functions
+              # gve_tx_idx_to_ntfy and gve_rx_idx_to_ntfy. Notify blocks are allocated here:
+              # https://github.com/GoogleCloudPlatform/compute-virtual-ethernet-linux/blob/1b4fe3f70e982b49507bc6fad865c23c9d22cc30/google/gve/gve_main.c#L394
+              # The bash here counts how many notify blocks there are, then identifies the
+              # base block for TX and RX in identical fashion to the GVE functions.
+              # TODO: Consider the case of if IRQ entries are not contiguous.
+              base_ntfy_block=$(ls -1 /sys/class/net/"${nic}"/device/msi_irqs | sort -n | head -n 1)
+              num_ntfy_blocks=$(ls /sys/class/net/"${nic}"/device/msi_irqs/ | wc -w)
+              tx_irq_base_directory=$((base_ntfy_block))
+              rx_irq_base_directory=$((base_ntfy_block + (num_ntfy_blocks / 2)))
+
+              for ((queue = 0; queue < $NUM_QUEUES; queue++)); do
+                echo $core > /proc/irq/$((tx_irq_base_directory + $queue))/smp_affinity_list
+                echo $core > /proc/irq/$((rx_irq_base_directory + $queue))/smp_affinity_list
+                # Also set XPS affinity for the TX queue to the same core.
+                cp /proc/irq/$((tx_irq_base_directory + $queue))/smp_affinity /sys/class/net/"$nic"/queues/tx-"$queue"/xps_cpus
+                core=$((core + 1))
+                # Wrap around to the first core if we run out of cores.
+                if [[ $core -ge $total_schedulable_cores ]]; then
+                  core=0
+                fi
+              done
+              echo "$nic cores:"
+              for ((queue = 0; queue < $NUM_QUEUES; queue++)); do
+                echo "queue $queue"
+                irq=$((tx_irq_base_directory + $queue))
+                cat /proc/irq/$irq/smp_affinity_list
+                irq=$((rx_irq_base_directory + $queue))
+                cat /proc/irq/$irq/smp_affinity_list
+              done
+              fi
+          done
+
+          # PART 2: TCP tunings.
+
+          # Disable metrics cache
+          sysctl -w net.ipv4.tcp_no_metrics_save=1
+
+          # Disable slow start after idle
+          sysctl -w net.ipv4.tcp_slow_start_after_idle=0
+
+          # Set rto_min 5ms and enable quickack
+          sysctl_rto_min_exists=$(sudo sysctl -a | grep tcp_rto_min_us)
+          if [[ -z "$sysctl_rto_min_exists" ]]; then
+            ip route show | while IFS= read -r route; do
+              if ! echo "$route" | grep -q "linkdown"; then
+                ip route change ${route/lock/} rto_min 5ms quickack 1
+              fi
+            done
+          else
+            sysctl -w net.ipv4.tcp_rto_min_us=5000
+            ip route show | while IFS= read -r route; do
+              if ! echo "$route" | grep -q "linkdown"; then
+                ip route change ${route/lock/} quickack 1
+              fi
+            done
+          fi
+
+          # Increase TCP zerocopy control memory
+          sysctl -w net.core.optmem_max=131072
+
+          # Disable Cubic Hystart Ack-Train
+          echo 2 > /sys/module/tcp_cubic/parameters/hystart_detect
+
+          # PART 3: Larger gve buffers.
+
+          echo "Enabling max rx buffer size for v6e "
+          for nic in $(ls /sys/class/net); do
+            if [[ -d "/sys/class/net/$nic/device" ]]; then
+              if ethtool --set-priv-flags "$nic" enable-max-rx-buffer-size on; then
+                echo "Max RX buffer size enabled for $nic"
+              else
+                echo "Unable to enable max RX buffer size for $nic"
+              fi
+            fi
+          done
+
+          # The script cannot return an error status.
+          exit 0
+        volumeMounts:
+        - mountPath: /
+          name: root
+      volumes:
+        - name: root
+          hostPath:
+            path: /
+            type: Directory
+      containers:
+      - image: "pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
+        name: pause