From 3436b8cad511560d99251e55eb3996cf4f976193 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 21 Jun 2024 15:27:08 -0700 Subject: [PATCH] [Core] Allow disabling ECC for nvidia-gpu (#3676) * Disable ECC for nvidia-gpu * Add config.rst * format * address * Note for the reboot overhead * address comments * fix fluidstack * Avoid disable ecc for clouds using ray autoscaler due to the lack of retry after reboot --- docs/source/reference/config.rst | 19 ++++++++++++++++++- sky/backends/backend_utils.py | 6 ++++++ sky/clouds/service_catalog/__init__.py | 2 +- sky/provision/fluidstack/instance.py | 2 +- sky/skylet/constants.py | 20 ++++++++++++++++++++ sky/templates/aws-ray.yml.j2 | 3 +++ sky/templates/cudo-ray.yml.j2 | 5 ++++- sky/templates/fluidstack-ray.yml.j2 | 5 ++++- sky/templates/gcp-ray.yml.j2 | 3 +++ sky/templates/kubernetes-ray.yml.j2 | 3 +++ sky/templates/paperspace-ray.yml.j2 | 5 ++++- sky/templates/runpod-ray.yml.j2 | 5 ++++- sky/templates/vsphere-ray.yml.j2 | 5 ++++- sky/utils/schemas.py | 12 ++++++++++++ 14 files changed, 87 insertions(+), 8 deletions(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 74cd2c01092..96be48e71e3 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -40,6 +40,24 @@ Available fields and semantics: - gcp - kubernetes + nvidia_gpus: + # Disable ECC for NVIDIA GPUs (optional). + # + # Set to true to disable ECC for NVIDIA GPUs during provisioning. This is + # useful to improve the GPU performance in some cases (up to 30% + # improvement). This will only be applied if a cluster is requested with + # NVIDIA GPUs. This is best-effort -- not guaranteed to work on all clouds + # e.g., RunPod and Kubernetes do not allow rebooting the node, though + # RunPod has ECC disabled by default. + # + # Note: this setting will cause a reboot during the first provisioning of + # the cluster, which may take a few minutes. 
+ # + # Reference: https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW + # + # Default: false. + disable_ecc: false + # Advanced AWS configurations (optional). # Apply to all new instances but not existing ones. aws: @@ -462,4 +480,3 @@ Available fields and semantics: us-ashburn-1: vcn_subnet: ocid1.subnet.oc1.iad.aaaaaaaafbj7i3aqc4ofjaapa5edakde6g4ea2yaslcsay32cthp7qo55pxa - diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 03f644930f4..0989a3f9122 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -875,6 +875,10 @@ def write_cluster_config( # Use a tmp file path to avoid incomplete YAML file being re-used in the # future. + initial_setup_commands = [] + if (skypilot_config.get_nested(('nvidia_gpus', 'disable_ecc'), False) and + to_provision.accelerators is not None): + initial_setup_commands.append(constants.DISABLE_GPU_ECC_COMMAND) tmp_yaml_path = yaml_path + '.tmp' common_utils.fill_template( cluster_config_template, @@ -906,6 +910,8 @@ def write_cluster_config( # currently only used by GCP. 'specific_reservations': specific_reservations, + # Initial setup commands. 
+ 'initial_setup_commands': initial_setup_commands, # Conda setup 'conda_installation_commands': constants.CONDA_INSTALLATION_COMMANDS, diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index 7479cd77cf7..acc6fa0aa8b 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -35,7 +35,7 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs): for cloud in clouds: try: cloud_module = importlib.import_module( - f'sky.clouds.service_catalog.{cloud}_catalog') + f'sky.clouds.service_catalog.{cloud.lower()}_catalog') except ModuleNotFoundError: raise ValueError( 'Cannot find module "sky.clouds.service_catalog' diff --git a/sky/provision/fluidstack/instance.py b/sky/provision/fluidstack/instance.py index b37519a8458..e870ff15e0c 100644 --- a/sky/provision/fluidstack/instance.py +++ b/sky/provision/fluidstack/instance.py @@ -26,7 +26,7 @@ def get_internal_ip(node_info: Dict[str, Any]) -> None: node_info['internal_ip'] = node_info['ip_address'] runner = command_runner.SSHCommandRunner( - node_info['ip_address'], + (node_info['ip_address'], 22), ssh_user=node_info['capabilities']['default_user_name'], ssh_private_key=auth.PRIVATE_SSH_KEY_PATH) result = runner.run(_GET_INTERNAL_IP_CMD, diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 52754f3052c..bfec3ad8cac 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -98,6 +98,26 @@ DOCKER_SERVER_ENV_VAR, } +# Command for disabling GPU ECC, which can improve the performance of the GPU +# for some workloads by 30%. This will only be applied when a user specifies +# `nvidia_gpus.disable_ecc: true` in ~/.sky/config.yaml. +# Running this command will reboot the machine, introducing overhead for +# provisioning the machine. +# https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW +DISABLE_GPU_ECC_COMMAND = ( + # Check if the GPU ECC is enabled. 
We use `sudo which` to check nvidia-smi + # because in some environments, nvidia-smi is not in path for sudo and we + # should skip disabling ECC in this case. + 'sudo which nvidia-smi && echo "Checking Nvidia ECC Mode" && ' + 'out=$(nvidia-smi -q | grep "ECC Mode" -A2) && ' + 'echo "$out" && echo "$out" | grep Current | grep Enabled && ' + 'echo "Disabling Nvidia ECC" && ' + # Disable the GPU ECC. + 'sudo nvidia-smi -e 0 && ' + # Reboot the machine to apply the changes. + '{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } ' + '|| true; ') + # Install conda on the remote cluster if it is not already installed. # We use conda with python 3.10 to be consistent across multiple clouds with # best effort. diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 66c01f53617..778c64f6926 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -153,6 +153,9 @@ setup_commands: # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - mkdir -p ~/.ssh; touch ~/.ssh/config; + {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} {{ conda_installation_commands }} conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com" || true; {{ ray_skypilot_installation_commands }} diff --git a/sky/templates/cudo-ray.yml.j2 b/sky/templates/cudo-ray.yml.j2 index f8f5c1cdc59..165e8fde2aa 100644 --- a/sky/templates/cudo-ray.yml.j2 +++ b/sky/templates/cudo-ray.yml.j2 @@ -54,7 +54,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. 
# Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/templates/fluidstack-ray.yml.j2 b/sky/templates/fluidstack-ray.yml.j2 index a0f952a443f..309a5393828 100644 --- a/sky/templates/fluidstack-ray.yml.j2 +++ b/sky/templates/fluidstack-ray.yml.j2 @@ -55,7 +55,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. 
# Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index f4ec10a697d..42f1d179498 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -182,6 +182,9 @@ setup_commands: # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; + {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} {%- if docker_image is none %} sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index e4d39854ab5..20c35b15641 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -364,6 +364,9 @@ setup_commands: # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - sudo DEBIAN_FRONTEND=noninteractive apt install gcc patch pciutils rsync fuse curl -y; mkdir -p ~/.ssh; touch ~/.ssh/config; + {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} {{ 
conda_installation_commands }} {{ ray_skypilot_installation_commands }} sudo touch ~/.sudo_as_admin_successful; diff --git a/sky/templates/paperspace-ray.yml.j2 b/sky/templates/paperspace-ray.yml.j2 index ba0886ee679..005f30b5233 100644 --- a/sky/templates/paperspace-ray.yml.j2 +++ b/sky/templates/paperspace-ray.yml.j2 @@ -73,7 +73,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/templates/runpod-ray.yml.j2 b/sky/templates/runpod-ray.yml.j2 index 62206d1a85c..8c063ac4f5d 100644 --- a/sky/templates/runpod-ray.yml.j2 +++ b/sky/templates/runpod-ray.yml.j2 @@ -52,7 +52,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. 
# Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/templates/vsphere-ray.yml.j2 b/sky/templates/vsphere-ray.yml.j2 index 7fc4cd9d01c..81c139d397d 100644 --- a/sky/templates/vsphere-ray.yml.j2 +++ b/sky/templates/vsphere-ray.yml.j2 @@ -51,7 +51,10 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - sudo systemctl stop unattended-upgrades || true; + - {%- for initial_setup_command in initial_setup_commands %} + {{ initial_setup_command }} + {%- endfor %} + sudo systemctl stop unattended-upgrades || true; sudo systemctl disable unattended-upgrades || true; sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 932f2075d21..97b46113da4 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -757,6 +757,17 @@ def get_config_schema(): } } + gpu_configs = { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'disable_ecc': { + 'type': 'boolean', + }, + } + } + for cloud, config in 
cloud_configs.items(): if cloud == 'aws': config['properties'].update(_REMOTE_IDENTITY_SCHEMA_AWS) @@ -774,6 +785,7 @@ def get_config_schema(): 'spot': controller_resources_schema, 'serve': controller_resources_schema, 'allowed_clouds': allowed_clouds, + 'nvidia_gpus': gpu_configs, **cloud_configs, }, # Avoid spot and jobs being present at the same time.