[Core] Allow disabling ECC for nvidia-gpu (#3676)
* Disable ECC for nvidia-gpu

* Add config.rst

* format

* address

* Note for the reboot overhead

* address comments

* fix fluidstack

* Avoid disabling ECC for clouds that use the Ray autoscaler, due to the lack of retry after reboot
Michaelvll authored Jun 21, 2024
1 parent a0a83e6 commit 3436b8c
Showing 14 changed files with 87 additions and 8 deletions.
19 changes: 18 additions & 1 deletion docs/source/reference/config.rst
@@ -40,6 +40,24 @@ Available fields and semantics:
- gcp
- kubernetes
nvidia_gpus:
# Disable ECC for NVIDIA GPUs (optional).
#
# Set to true to disable ECC for NVIDIA GPUs during provisioning. Disabling
# ECC can improve GPU performance for some workloads (by up to 30%). This
# is only applied when a cluster is requested with NVIDIA GPUs, and it is
# best-effort -- not guaranteed to work on all clouds. For example, RunPod
# and Kubernetes do not allow rebooting the node, though RunPod has ECC
# disabled by default.
#
# Note: this setting will cause a reboot during the first provisioning of
# the cluster, which may take a few minutes.
#
# Reference: https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW
#
# Default: false.
disable_ecc: false
# Advanced AWS configurations (optional).
# Apply to all new instances but not existing ones.
aws:
@@ -462,4 +480,3 @@ Available fields and semantics:
us-ashburn-1:
vcn_subnet: ocid1.subnet.oc1.iad.aaaaaaaafbj7i3aqc4ofjaapa5edakde6g4ea2yaslcsay32cthp7qo55pxa
6 changes: 6 additions & 0 deletions sky/backends/backend_utils.py
@@ -875,6 +875,10 @@ def write_cluster_config(

# Use a tmp file path to avoid incomplete YAML file being re-used in the
# future.
initial_setup_commands = []
if (skypilot_config.get_nested(('nvidia_gpus', 'disable_ecc'), False) and
to_provision.accelerators is not None):
initial_setup_commands.append(constants.DISABLE_GPU_ECC_COMMAND)
tmp_yaml_path = yaml_path + '.tmp'
common_utils.fill_template(
cluster_config_template,
@@ -906,6 +910,8 @@
# currently only used by GCP.
'specific_reservations': specific_reservations,

# Initial setup commands.
'initial_setup_commands': initial_setup_commands,
# Conda setup
'conda_installation_commands':
constants.CONDA_INSTALLATION_COMMANDS,
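The backend change above appends `DISABLE_GPU_ECC_COMMAND` to a new `initial_setup_commands` list and passes it to `fill_template`; each provisioning template below gains a `{%- for %}` loop that inlines those commands into `setup_commands`. A minimal, self-contained sketch of that expansion, using a throwaway jinja2 template and an abbreviated command rather than SkyPilot's real cluster YAML:

```python
# Illustrative only: shows how the templates below expand initial_setup_commands.
import jinja2

template = jinja2.Template(
    'setup_commands:\n'
    '  - mkdir -p ~/.ssh; touch ~/.ssh/config;\n'
    '    {%- for initial_setup_command in initial_setup_commands %}\n'
    '    {{ initial_setup_command }}\n'
    '    {%- endfor %}\n'
    '    echo "remaining setup";\n')

# With the flag off, the list is empty and the rendered YAML is unchanged.
print(template.render(initial_setup_commands=[]))
# With the flag on, the ECC command (abbreviated here) is inlined before
# the rest of the setup commands.
print(template.render(
    initial_setup_commands=['sudo nvidia-smi -e 0 && sudo reboot || true;']))
```

Because an empty list renders to exactly the previous output, clusters launched without `nvidia_gpus.disable_ecc` are unaffected by the template change.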
2 changes: 1 addition & 1 deletion sky/clouds/service_catalog/__init__.py
@@ -35,7 +35,7 @@ def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
for cloud in clouds:
try:
cloud_module = importlib.import_module(
f'sky.clouds.service_catalog.{cloud}_catalog')
f'sky.clouds.service_catalog.{cloud.lower()}_catalog')
except ModuleNotFoundError:
raise ValueError(
'Cannot find module "sky.clouds.service_catalog'
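The one-character fix above matters because the catalog modules are named with the lowercase cloud identifier; without `.lower()`, a capitalized cloud name builds a module path that cannot be imported. A small illustrative sketch of the lookup (the helper name and error message are placeholders, not SkyPilot's):

```python
# Illustrative sketch: importlib module names are case-sensitive, so the
# cloud name must be lowercased before building the module path.
import importlib

def load_catalog(cloud_name: str):
    # e.g. 'AWS' -> 'sky.clouds.service_catalog.aws_catalog'
    module_path = f'sky.clouds.service_catalog.{cloud_name.lower()}_catalog'
    try:
        return importlib.import_module(module_path)
    except ModuleNotFoundError:
        # Without .lower(), 'AWS' would yield '...AWS_catalog', which does not exist.
        raise ValueError(f'Cannot find catalog module {module_path!r}') from None
```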
2 changes: 1 addition & 1 deletion sky/provision/fluidstack/instance.py
@@ -26,7 +26,7 @@
def get_internal_ip(node_info: Dict[str, Any]) -> None:
node_info['internal_ip'] = node_info['ip_address']
runner = command_runner.SSHCommandRunner(
node_info['ip_address'],
(node_info['ip_address'], 22),
ssh_user=node_info['capabilities']['default_user_name'],
ssh_private_key=auth.PRIVATE_SSH_KEY_PATH)
result = runner.run(_GET_INTERNAL_IP_CMD,
20 changes: 20 additions & 0 deletions sky/skylet/constants.py
@@ -98,6 +98,26 @@
DOCKER_SERVER_ENV_VAR,
}

# Command to disable GPU ECC, which can improve GPU performance for some
# workloads by up to 30%. It is only applied when a user specifies
# `nvidia_gpus.disable_ecc: true` in ~/.sky/config.yaml.
# Running this command reboots the machine, adding overhead to the first
# provisioning of the cluster.
# https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW
DISABLE_GPU_ECC_COMMAND = (
# Check whether GPU ECC is enabled. We use `sudo which` to look for
# nvidia-smi because in some environments it is not on sudo's PATH, in
# which case we should skip disabling ECC.
'sudo which nvidia-smi && echo "Checking Nvidia ECC Mode" && '
'out=$(nvidia-smi -q | grep "ECC Mode" -A2) && '
'echo "$out" && echo "$out" | grep Current | grep Enabled && '
'echo "Disabling Nvidia ECC" && '
# Disable the GPU ECC.
'sudo nvidia-smi -e 0 && '
# Reboot the machine to apply the changes.
'{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } '
'|| true; ')

# Install conda on the remote cluster if it is not already installed.
# We use conda with python 3.10 to be consistent across multiple clouds with
# best effort.
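For readability, the shell pipeline above can be read as the following Python sketch. It is an illustrative re-expression of the same check-then-disable-then-reboot flow, not the constant SkyPilot ships, and it assumes `nvidia-smi` and passwordless sudo are available on the node:

```python
# Illustrative sketch of the logic in DISABLE_GPU_ECC_COMMAND.
import shutil
import subprocess

def disable_gpu_ecc_if_enabled() -> None:
    # Skip when nvidia-smi is unavailable, as the shell command does via
    # `sudo which nvidia-smi`.
    if shutil.which('nvidia-smi') is None:
        return
    # Equivalent of `nvidia-smi -q | grep "ECC Mode" -A2 | grep Current | grep Enabled`.
    query = subprocess.run(['nvidia-smi', '-q'],
                           capture_output=True, text=True, check=True)
    lines = query.stdout.splitlines()
    window = []
    for i, line in enumerate(lines):
        if 'ECC Mode' in line:
            window = lines[i:i + 3]  # the heading plus the next two lines (-A2)
            break
    if not any('Current' in l and 'Enabled' in l for l in window):
        return  # ECC already disabled (or not reported); nothing to do.
    # Disable ECC; the change only takes effect after a reboot.
    subprocess.run(['sudo', 'nvidia-smi', '-e', '0'], check=True)
    subprocess.run(['sudo', 'reboot'], check=False)
```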
3 changes: 3 additions & 0 deletions sky/templates/aws-ray.yml.j2
@@ -153,6 +153,9 @@ setup_commands:
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- mkdir -p ~/.ssh; touch ~/.ssh/config;
{%- for initial_setup_command in initial_setup_commands %}
{{ initial_setup_command }}
{%- endfor %}
{{ conda_installation_commands }}
conda config --remove channels "https://aws-ml-conda-ec2.s3.us-west-2.amazonaws.com" || true;
{{ ray_skypilot_installation_commands }}
5 changes: 4 additions & 1 deletion sky/templates/cudo-ray.yml.j2
@@ -54,7 +54,10 @@ setup_commands:
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- sudo systemctl stop unattended-upgrades || true;
- {%- for initial_setup_command in initial_setup_commands %}
{{ initial_setup_command }}
{%- endfor %}
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
5 changes: 4 additions & 1 deletion sky/templates/fluidstack-ray.yml.j2
@@ -55,7 +55,10 @@ setup_commands:
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- sudo systemctl stop unattended-upgrades || true;
- {%- for initial_setup_command in initial_setup_commands %}
{{ initial_setup_command }}
{%- endfor %}
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
3 changes: 3 additions & 0 deletions sky/templates/gcp-ray.yml.j2
@@ -182,6 +182,9 @@ setup_commands:
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
{%- for initial_setup_command in initial_setup_commands %}
{{ initial_setup_command }}
{%- endfor %}
{%- if docker_image is none %}
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
3 changes: 3 additions & 0 deletions sky/templates/kubernetes-ray.yml.j2
@@ -364,6 +364,9 @@ setup_commands:
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- sudo DEBIAN_FRONTEND=noninteractive apt install gcc patch pciutils rsync fuse curl -y;
mkdir -p ~/.ssh; touch ~/.ssh/config;
{%- for initial_setup_command in initial_setup_commands %}
{{ initial_setup_command }}
{%- endfor %}
{{ conda_installation_commands }}
{{ ray_skypilot_installation_commands }}
sudo touch ~/.sudo_as_admin_successful;
5 changes: 4 additions & 1 deletion sky/templates/paperspace-ray.yml.j2
@@ -73,7 +73,10 @@ setup_commands:
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- sudo systemctl stop unattended-upgrades || true;
- {%- for initial_setup_command in initial_setup_commands %}
{{ initial_setup_command }}
{%- endfor %}
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
5 changes: 4 additions & 1 deletion sky/templates/runpod-ray.yml.j2
@@ -52,7 +52,10 @@ setup_commands:
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- sudo systemctl stop unattended-upgrades || true;
- {%- for initial_setup_command in initial_setup_commands %}
{{ initial_setup_command }}
{%- endfor %}
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
5 changes: 4 additions & 1 deletion sky/templates/vsphere-ray.yml.j2
@@ -51,7 +51,10 @@ setup_commands:
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- sudo systemctl stop unattended-upgrades || true;
- {%- for initial_setup_command in initial_setup_commands %}
{{ initial_setup_command }}
{%- endfor %}
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
12 changes: 12 additions & 0 deletions sky/utils/schemas.py
@@ -757,6 +757,17 @@ def get_config_schema():
}
}

gpu_configs = {
'type': 'object',
'required': [],
'additionalProperties': False,
'properties': {
'disable_ecc': {
'type': 'boolean',
},
}
}

for cloud, config in cloud_configs.items():
if cloud == 'aws':
config['properties'].update(_REMOTE_IDENTITY_SCHEMA_AWS)
@@ -774,6 +785,7 @@ def get_config_schema():
'spot': controller_resources_schema,
'serve': controller_resources_schema,
'allowed_clouds': allowed_clouds,
'nvidia_gpus': gpu_configs,
**cloud_configs,
},
# Avoid spot and jobs being present at the same time.
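The `gpu_configs` sub-schema above allows only a boolean `disable_ecc` key and rejects anything else via `additionalProperties: False`. A standalone sketch of that behavior with the `jsonschema` package, validated directly against the sub-schema rather than SkyPilot's full config schema:

```python
# Standalone sketch of the nvidia_gpus sub-schema added above.
import jsonschema

gpu_configs = {
    'type': 'object',
    'required': [],
    'additionalProperties': False,
    'properties': {
        'disable_ecc': {'type': 'boolean'},
    },
}

jsonschema.validate({'disable_ecc': True}, gpu_configs)       # passes
try:
    jsonschema.validate({'disable_ecc': 'yes'}, gpu_configs)  # wrong type
except jsonschema.ValidationError as e:
    print('rejected:', e.message)
try:
    jsonschema.validate({'ecc': False}, gpu_configs)          # unknown key
except jsonschema.ValidationError as e:
    print('rejected:', e.message)
```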
