-
Notifications
You must be signed in to change notification settings - Fork 177
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding a v6e network optimization daemonset to ai-on-gke repository. (#805)
- Loading branch information
1 parent
daf6d32
commit 333765c
Showing
1 changed file
with
172 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
# DaemonSet `tpu-network-optimization` (namespace kube-system): runs a
# privileged init container that executes the inline bash script below.
apiVersion: apps/v1 | ||
kind: DaemonSet | ||
metadata: | ||
name: tpu-network-optimization | ||
namespace: kube-system | ||
labels: | ||
k8s-app: tpu-network-optimization | ||
spec: | ||
selector: | ||
matchLabels: | ||
k8s-app: tpu-network-optimization | ||
template: | ||
metadata: | ||
labels: | ||
k8s-app: tpu-network-optimization | ||
spec: | ||
priorityClassName: system-node-critical | ||
# hostNetwork: true prevents a pod IP from being allocated to this pod, which can help with IP space utilization. | ||
hostNetwork: true | ||
# The pod also shares the host's PID namespace.
hostPID: true | ||
# Schedule only onto nodes whose cloud.google.com/gke-tpu-accelerator label
# is tpu-v6e-slice.
affinity: | ||
nodeAffinity: | ||
requiredDuringSchedulingIgnoredDuringExecution: | ||
nodeSelectorTerms: | ||
- matchExpressions: | ||
- key: cloud.google.com/gke-tpu-accelerator | ||
operator: In | ||
values: | ||
- tpu-v6e-slice | ||
# Tolerate every NoExecute and NoSchedule taint (operator Exists, no key).
tolerations: | ||
- operator: "Exists" | ||
effect: "NoExecute" | ||
- operator: "Exists" | ||
effect: "NoSchedule" | ||
# The tuning script runs as an init container in privileged mode.
# NOTE(review): "ubuntu:latest" is a floating tag — consider pinning a digest,
# as is done for the pause image at the end of this manifest.
initContainers: | ||
- name: "tpu-network-optimization" | ||
image: "ubuntu:latest" | ||
securityContext: | ||
privileged: true | ||
# The whole script is passed to `bash -c` as one block scalar.
command: | ||
- bash | ||
- -c | ||
- | | ||
#!/bin/bash | ||
#######################################
# Reports whether this VM is a v6e TPU host.
# Asks the metadata server for the instance machine type and checks that the
# answer contains "ct6e".
# Outputs:   "machine_type: <value>" on stdout; curl/lookup errors on stderr.
# Returns:   0 (success) if it's running on a v6e VM, 1 otherwise, including
#            when the machine type cannot be read.
#######################################
is_v6etpu_platform() {
  local machine_type
  # --silent keeps curl's progress meter out of the pod logs, --fail stops an
  # HTTP error body from being treated as a machine type, and --max-time keeps
  # an unreachable metadata server from hanging the init container.
  if ! machine_type=$(curl --silent --show-error --fail --max-time 10 \
    -H "Metadata-Flavor: Google" \
    http://169.254.169.254/computeMetadata/v1/instance/machine-type); then
    echo "Unable to read machine type from the metadata server" >&2
    return 1
  fi
  echo "machine_type: $machine_type"
  # Non-v6 TPUs are exempt
  [[ "$machine_type" == *"ct6e"* ]]
}
# Gate: everything after this point is v6e-specific, so on any other platform
# the script stops here with a success status.
is_v6etpu_platform || {
  echo "Not a v6e TPU platform"
  exit 0
}
echo "Running on a v6e TPU platform"
# Reaching this line means a v6e platform; the v6e-specific network tunings follow.
# PART 1: IRQ SPREADING. If this VM has multiple vnics, we need to make sure
# they're using different sets of cores for interrupt handling.

#######################################
# Assigns the TX and RX interrupt of every NIC queue to a core, handing out
# cores round-robin across all NICs, and copies the TX interrupt's CPU mask
# into the matching TX queue's XPS mask.
# Cores are taken from 0 .. (node0 CPU count / cpu0 sibling count) - 1; the
# stated intent is to stay on node 0 and avoid hyperbuddies.
# Outputs:   progress lines and the resulting per-queue affinities on stdout.
# Returns:   0; a failed write is reported by the shell and does not stop the
#            remaining queues or NICs.
#######################################
spread_nic_irqs() {
  local -a node0_cpus
  node0_cpus=(/sys/devices/system/node/node0/cpu[0-9]*)
  local node0_cores=${#node0_cpus[@]}

  local siblings_file=/sys/devices/system/cpu/cpu0/topology/core_cpus_list
  local ht_buddies=0
  if [[ -r "$siblings_file" ]]; then
    ht_buddies=$(tr ',' ' ' < "$siblings_file" | wc -w)
  fi
  # A missing or empty siblings file must not turn into a division by zero.
  ((ht_buddies >= 1)) || ht_buddies=1

  # Used to wrap around to the first core if we run out of cores. We limit
  # ourselves to node 0, and avoid hyperbuddies.
  local total_schedulable_cores=$((node0_cores / ht_buddies))
  ((total_schedulable_cores >= 1)) || total_schedulable_cores=1

  local core=0
  local nic_path nic msi_dir num_queues queue
  local base_ntfy_block num_ntfy_blocks
  local tx_irq_base_directory rx_irq_base_directory tx_irq rx_irq
  local -a tx_queues irq_paths irqs

  for nic_path in /sys/class/net/*; do
    [[ -e "$nic_path" ]] || continue
    nic=${nic_path##*/}
    echo "Updating interrupt cores for $nic"
    # Only interfaces backed by a device directory are tuned.
    [[ -d "$nic_path/device" ]] || continue

    # Without MSI IRQ entries the base IRQ would evaluate to 0 and the loop
    # below would write to /proc/irq/0, so skip such interfaces.
    msi_dir="$nic_path/device/msi_irqs"
    irq_paths=("$msi_dir"/*)
    if [[ ! -e "${irq_paths[0]}" ]]; then
      echo "No MSI IRQs found for $nic, skipping"
      continue
    fi

    # ASSUMPTION: There are an equal number of TX and RX queues.
    tx_queues=("$nic_path"/queues/tx-*)
    if [[ ! -e "${tx_queues[0]}" ]]; then
      echo "No TX queues found for $nic, skipping"
      continue
    fi
    num_queues=${#tx_queues[@]}

    # Helpers to figure out where to write the soft IRQ affinities. See functions
    # gve_tx_idx_to_ntfy and gve_rx_idx_to_ntfy. Notify blocks are allocated here:
    # https://github.com/GoogleCloudPlatform/compute-virtual-ethernet-linux/blob/1b4fe3f70e982b49507bc6fad865c23c9d22cc30/google/gve/gve_main.c#L394
    # The bash here counts how many notify blocks there are, then identifies the
    # base block for TX and RX in identical fashion to the GVE functions.
    # TODO: Consider the case of if IRQ entries are not contiguous.
    mapfile -t irqs < <(printf '%s\n' "${irq_paths[@]##*/}" | sort -n)
    base_ntfy_block=${irqs[0]}
    num_ntfy_blocks=${#irqs[@]}
    tx_irq_base_directory=$((base_ntfy_block))
    rx_irq_base_directory=$((base_ntfy_block + (num_ntfy_blocks / 2)))

    for ((queue = 0; queue < num_queues; queue++)); do
      tx_irq=$((tx_irq_base_directory + queue))
      rx_irq=$((rx_irq_base_directory + queue))
      echo "$core" > "/proc/irq/$tx_irq/smp_affinity_list"
      echo "$core" > "/proc/irq/$rx_irq/smp_affinity_list"
      # Also set XPS affinity for the TX queue to the same core.
      cp "/proc/irq/$tx_irq/smp_affinity" "$nic_path/queues/tx-$queue/xps_cpus"
      # Wrap around to the first core if we run out of cores.
      core=$(((core + 1) % total_schedulable_cores))
    done

    echo "$nic cores:"
    for ((queue = 0; queue < num_queues; queue++)); do
      echo "queue $queue"
      cat "/proc/irq/$((tx_irq_base_directory + queue))/smp_affinity_list"
      cat "/proc/irq/$((rx_irq_base_directory + queue))/smp_affinity_list"
    done
  done
  return 0
}
spread_nic_irqs
# PART 2: TCP tunings.
# Disable metrics cache
sysctl -w net.ipv4.tcp_no_metrics_save=1
# Disable slow start after idle
sysctl -w net.ipv4.tcp_slow_start_after_idle=0

#######################################
# Re-issues every route reported by `ip route show`, except those marked
# "linkdown", through `ip route change` with extra options appended.
# Arguments: the options to append (e.g. quickack 1).
# Outputs:   whatever `ip route` prints.
#######################################
apply_route_options() {
  local route
  local -a route_fields
  ip route show | while IFS= read -r route; do
    [[ "$route" == *linkdown* ]] && continue
    # The first "lock" keyword is removed before the route is fed back.
    # NOTE(review): presumably `ip route change` rejects it in the form that
    # `ip route show` prints — confirm.
    # Splitting into an array gives the same words as an unquoted expansion
    # but without pathname expansion.
    read -r -a route_fields <<< "${route/lock/}"
    ip route change "${route_fields[@]}" "$@"
  done
}

# Set rto_min 5ms and enable quickack
# The sysctl is looked up through its /proc/sys entry rather than by grepping
# `sudo sysctl -a`, which depends on sudo being present.
if [[ -e /proc/sys/net/ipv4/tcp_rto_min_us ]]; then
  sysctl -w net.ipv4.tcp_rto_min_us=5000
  apply_route_options quickack 1
else
  apply_route_options rto_min 5ms quickack 1
fi

# Increase TCP zerocopy control memory
sysctl -w net.core.optmem_max=131072

# Disable Cubic Hystart Ack-Train
hystart_param=/sys/module/tcp_cubic/parameters/hystart_detect
if [[ -w "$hystart_param" ]]; then
  echo 2 > "$hystart_param"
else
  echo "$hystart_param is not writable, skipping"
fi
# PART 3: Larger gve buffers.
echo "Enabling max rx buffer size for v6e "
for nic_dir in /sys/class/net/*; do
  # Only interfaces with a device directory get the private flag.
  [[ -d "${nic_dir}/device" ]] || continue
  iface="${nic_dir##*/}"
  if ! ethtool --set-priv-flags "${iface}" enable-max-rx-buffer-size on; then
    echo "Unable to enable max RX buffer size for ${iface}"
    continue
  fi
  echo "Max RX buffer size enabled for ${iface}"
done
# Finish with status 0 unconditionally: the script must not report an error.
exit 0
# The `root` volume is mounted at `/` inside the container.
# NOTE(review): presumably so the script runs the node's own tools
# (curl, ip, sysctl, ethtool) — confirm.
volumeMounts: | ||
- mountPath: / | ||
name: root | ||
# `root` is a hostPath volume for the node's `/`, which must be a directory.
volumes: | ||
- name: root | ||
hostPath: | ||
path: / | ||
type: Directory | ||
# Regular (non-init) container of the pod; the image is pinned by digest.
# NOTE(review): presumably only here to keep the pod running after the init
# container has finished — confirm.
# NOTE(review): the image reference has no registry host — confirm it
# resolves on the target clusters.
containers: | ||
- image: "pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830" | ||
name: pause | ||