From 5d68632ee979ee52aa9092d364679f298d8e43aa Mon Sep 17 00:00:00 2001 From: landscapepainter <34902420+landscapepainter@users.noreply.github.com> Date: Sun, 18 Aug 2024 14:29:02 -0700 Subject: [PATCH 01/12] [SkyServe][Test] Fix test_smoke.py::test_skyserve_new_autoscaler_update (#3824) * fix test_smoke.py::test_skyserve_new_autoscaler_update * nit * format * move out the comment from run at new_autoscaler_after.yaml * revert comment * revert templating * nit * nit --- tests/skyserve/update/new_autoscaler_after.yaml | 3 +-- tests/skyserve/update/new_autoscaler_before.yaml | 3 +-- tests/test_smoke.py | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/skyserve/update/new_autoscaler_after.yaml b/tests/skyserve/update/new_autoscaler_after.yaml index f5a2e552f67..2d12d3ef109 100644 --- a/tests/skyserve/update/new_autoscaler_after.yaml +++ b/tests/skyserve/update/new_autoscaler_after.yaml @@ -8,7 +8,6 @@ service: base_ondemand_fallback_replicas: 1 resources: - cloud: gcp ports: 8081 use_spot: true cpus: 2+ @@ -22,4 +21,4 @@ run: | # blue-green update. sleep 120 fi - python3 server.py + python3 server.py --port 8081 diff --git a/tests/skyserve/update/new_autoscaler_before.yaml b/tests/skyserve/update/new_autoscaler_before.yaml index a91c3cd230a..793221080ae 100644 --- a/tests/skyserve/update/new_autoscaler_before.yaml +++ b/tests/skyserve/update/new_autoscaler_before.yaml @@ -5,10 +5,9 @@ service: replicas: 2 resources: - cloud: gcp ports: 8081 cpus: 2+ workdir: examples/serve/http_server -run: python3 server.py +run: python3 server.py --port 8081 diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 33280e1fd4c..874e51e0f9c 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3952,7 +3952,7 @@ def test_skyserve_update_autoscale(generic_cloud: str): @pytest.mark.parametrize('mode', ['rolling', 'blue_green']) def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): """Test skyserve with update that changes autoscaler""" - name = _get_service_name() + mode + name = f'{_get_service_name()}-{mode}' wait_until_no_pending = ( f's=$(sky serve status {name}); echo "$s"; ' @@ -3982,7 +3982,7 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): _check_service_version(name, "1"), ] test = Test( - 'test-skyserve-new-autoscaler-update', + f'test-skyserve-new-autoscaler-update-{mode}', [ f'sky serve up -n {name} --cloud {generic_cloud} -y tests/skyserve/update/new_autoscaler_before.yaml', _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + From 0f8882e358fbbf3b97dbe2ef17291cd1e708cdd9 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Mon, 19 Aug 2024 19:54:04 +0800 Subject: [PATCH 02/12] [Catalog] Fix Azure catalog duplications (#3842) fix --- sky/clouds/service_catalog/data_fetchers/fetch_azure.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 9a7b2a90bee..bbd337e23aa 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -140,8 +140,12 @@ def get_pricing_df(region: Optional[str] = None) -> 'pd.DataFrame': print(f'Done fetching pricing {region}') df = pd.DataFrame(all_items) assert 'productName' in df.columns, (region, df.columns) - return df[(~df['productName'].str.contains(' Windows')) & - (df['unitPrice'] > 0)] + # Filter out the cloud services and windows products. + # Some H100 series use ' Win' instead of ' Windows', e.g. + # Virtual Machines NCCadsv5 Srs Win + return df[ + (~df['productName'].str.contains(' Win| Cloud Services| CloudServices')) + & (df['unitPrice'] > 0)] def get_sku_df(region_set: Set[str]) -> 'pd.DataFrame': From fffeacda56a96b71666724ddab13e89f90946943 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Mon, 19 Aug 2024 21:26:13 +0800 Subject: [PATCH 03/12] [Core][TPU] Support TPU V5 (#3814) * init * add v5p in europe-west4-b * reset tpu v5 default runtime version * resolve comments * add comment for pricing ingo * apply suggestions from code review * refactor comment --- .../data_fetchers/fetch_gcp.py | 151 ++++++++++++++++-- sky/resources.py | 15 +- 2 files changed, 152 insertions(+), 14 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py index 9578196b4eb..5b680500c75 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py @@ -54,6 +54,110 @@ ,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d ,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d """))) + +# TPU V5 is not visible in specific zones. We hardcode the missing zones here. +# NOTE(dev): Keep the zones and the df in sync. +TPU_V5_MISSING_ZONES_DF = { + 'europe-west4-b': pd.read_csv( + io.StringIO( + textwrap.dedent("""\ + AcceleratorName,AcceleratorCount,Region,AvailabilityZone + tpu-v5p-8,1,europe-west4,europe-west4-b + tpu-v5p-16,1,europe-west4,europe-west4-b + tpu-v5p-32,1,europe-west4,europe-west4-b + tpu-v5p-64,1,europe-west4,europe-west4-b + tpu-v5p-128,1,europe-west4,europe-west4-b + tpu-v5p-256,1,europe-west4,europe-west4-b + tpu-v5p-384,1,europe-west4,europe-west4-b + tpu-v5p-512,1,europe-west4,europe-west4-b + tpu-v5p-640,1,europe-west4,europe-west4-b + tpu-v5p-768,1,europe-west4,europe-west4-b + tpu-v5p-896,1,europe-west4,europe-west4-b + tpu-v5p-1024,1,europe-west4,europe-west4-b + tpu-v5p-1152,1,europe-west4,europe-west4-b + tpu-v5p-1280,1,europe-west4,europe-west4-b + tpu-v5p-1408,1,europe-west4,europe-west4-b + tpu-v5p-1536,1,europe-west4,europe-west4-b + tpu-v5p-1664,1,europe-west4,europe-west4-b + tpu-v5p-1792,1,europe-west4,europe-west4-b + tpu-v5p-1920,1,europe-west4,europe-west4-b + tpu-v5p-2048,1,europe-west4,europe-west4-b + tpu-v5p-2176,1,europe-west4,europe-west4-b + tpu-v5p-2304,1,europe-west4,europe-west4-b + tpu-v5p-2432,1,europe-west4,europe-west4-b + tpu-v5p-2560,1,europe-west4,europe-west4-b + tpu-v5p-2688,1,europe-west4,europe-west4-b + tpu-v5p-2816,1,europe-west4,europe-west4-b + tpu-v5p-2944,1,europe-west4,europe-west4-b + tpu-v5p-3072,1,europe-west4,europe-west4-b + tpu-v5p-3200,1,europe-west4,europe-west4-b + tpu-v5p-3328,1,europe-west4,europe-west4-b + tpu-v5p-3456,1,europe-west4,europe-west4-b + tpu-v5p-3584,1,europe-west4,europe-west4-b + tpu-v5p-3712,1,europe-west4,europe-west4-b + tpu-v5p-3840,1,europe-west4,europe-west4-b + tpu-v5p-3968,1,europe-west4,europe-west4-b + tpu-v5p-4096,1,europe-west4,europe-west4-b + tpu-v5p-4224,1,europe-west4,europe-west4-b + tpu-v5p-4352,1,europe-west4,europe-west4-b + tpu-v5p-4480,1,europe-west4,europe-west4-b + tpu-v5p-4608,1,europe-west4,europe-west4-b + tpu-v5p-4736,1,europe-west4,europe-west4-b + tpu-v5p-4864,1,europe-west4,europe-west4-b + tpu-v5p-4992,1,europe-west4,europe-west4-b + tpu-v5p-5120,1,europe-west4,europe-west4-b + tpu-v5p-5248,1,europe-west4,europe-west4-b + tpu-v5p-5376,1,europe-west4,europe-west4-b + tpu-v5p-5504,1,europe-west4,europe-west4-b + tpu-v5p-5632,1,europe-west4,europe-west4-b + tpu-v5p-5760,1,europe-west4,europe-west4-b + tpu-v5p-5888,1,europe-west4,europe-west4-b + tpu-v5p-6016,1,europe-west4,europe-west4-b + tpu-v5p-6144,1,europe-west4,europe-west4-b + tpu-v5p-6272,1,europe-west4,europe-west4-b + tpu-v5p-6400,1,europe-west4,europe-west4-b + tpu-v5p-6528,1,europe-west4,europe-west4-b + tpu-v5p-6656,1,europe-west4,europe-west4-b + tpu-v5p-6784,1,europe-west4,europe-west4-b + tpu-v5p-6912,1,europe-west4,europe-west4-b + tpu-v5p-7040,1,europe-west4,europe-west4-b + tpu-v5p-7168,1,europe-west4,europe-west4-b + tpu-v5p-7296,1,europe-west4,europe-west4-b + tpu-v5p-7424,1,europe-west4,europe-west4-b + tpu-v5p-7552,1,europe-west4,europe-west4-b + tpu-v5p-7680,1,europe-west4,europe-west4-b + tpu-v5p-7808,1,europe-west4,europe-west4-b + tpu-v5p-7936,1,europe-west4,europe-west4-b + tpu-v5p-8064,1,europe-west4,europe-west4-b + tpu-v5p-8192,1,europe-west4,europe-west4-b + tpu-v5p-8320,1,europe-west4,europe-west4-b + tpu-v5p-8448,1,europe-west4,europe-west4-b + tpu-v5p-8704,1,europe-west4,europe-west4-b + tpu-v5p-8832,1,europe-west4,europe-west4-b + tpu-v5p-8960,1,europe-west4,europe-west4-b + tpu-v5p-9216,1,europe-west4,europe-west4-b + tpu-v5p-9472,1,europe-west4,europe-west4-b + tpu-v5p-9600,1,europe-west4,europe-west4-b + tpu-v5p-9728,1,europe-west4,europe-west4-b + tpu-v5p-9856,1,europe-west4,europe-west4-b + tpu-v5p-9984,1,europe-west4,europe-west4-b + tpu-v5p-10240,1,europe-west4,europe-west4-b + tpu-v5p-10368,1,europe-west4,europe-west4-b + tpu-v5p-10496,1,europe-west4,europe-west4-b + tpu-v5p-10752,1,europe-west4,europe-west4-b + tpu-v5p-10880,1,europe-west4,europe-west4-b + tpu-v5p-11008,1,europe-west4,europe-west4-b + tpu-v5p-11136,1,europe-west4,europe-west4-b + tpu-v5p-11264,1,europe-west4,europe-west4-b + tpu-v5p-11520,1,europe-west4,europe-west4-b + tpu-v5p-11648,1,europe-west4,europe-west4-b + tpu-v5p-11776,1,europe-west4,europe-west4-b + tpu-v5p-11904,1,europe-west4,europe-west4-b + tpu-v5p-12032,1,europe-west4,europe-west4-b + tpu-v5p-12160,1,europe-west4,europe-west4-b + tpu-v5p-12288,1,europe-west4,europe-west4-b + """))) +} # FIXME(woosuk): Remove this once the bug is fixed. # See https://github.com/skypilot-org/skypilot/issues/1759#issue-1619614345 TPU_V4_HOST_DF = pd.read_csv( @@ -415,6 +519,12 @@ def get_gpu_price(row: pd.Series, spot: bool) -> Optional[float]: def _get_tpu_for_zone(zone: str) -> 'pd.DataFrame': + # Use hardcoded TPU V5 data as it is invisible in some zones. + missing_tpus_df = pd.DataFrame(columns=[ + 'AcceleratorName', 'AcceleratorCount', 'Region', 'AvailabilityZone' + ]) + if zone in TPU_V5_MISSING_ZONES_DF: + missing_tpus_df = TPU_V5_MISSING_ZONES_DF[zone] tpus = [] parent = f'projects/{project_id}/locations/{zone}' tpus_request = tpu_client.projects().locations().acceleratorTypes().list( @@ -432,16 +542,14 @@ def _get_tpu_for_zone(zone: str) -> 'pd.DataFrame': new_tpus = [] for tpu in tpus: tpu_name = tpu['type'] - # skip tpu v5 as we currently don't support it - if 'v5' in tpu_name: - continue new_tpus.append({ 'AcceleratorName': f'tpu-{tpu_name}', 'AcceleratorCount': 1, 'Region': zone.rpartition('-')[0], 'AvailabilityZone': zone, }) - return pd.DataFrame(new_tpus).reset_index(drop=True) + new_tpu_df = pd.DataFrame(new_tpus).reset_index(drop=True) + return pd.concat([new_tpu_df, missing_tpus_df]) def _get_tpus() -> 'pd.DataFrame': @@ -458,11 +566,22 @@ def _get_tpus() -> 'pd.DataFrame': # TODO: the TPUs fetched fails to contain us-east1 -def get_tpu_df(skus: List[Dict[str, Any]]) -> 'pd.DataFrame': +def get_tpu_df(gce_skus: List[Dict[str, Any]], + tpu_skus: List[Dict[str, Any]]) -> 'pd.DataFrame': df = _get_tpus() if df.empty: return df + def _get_tpu_description_str(tpu_version: str) -> str: + # TPU V5 has a different naming convention since it is contained in + # the GCE SKUs. v5p -> TpuV5p, v5litepod -> TpuV5e. + if tpu_version.startswith('v5'): + if tpu_version == 'v5p': + return 'TpuV5p' + assert tpu_version == 'v5litepod', tpu_version + return 'TpuV5e' + return f'Tpu-{tpu_version}' + def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]: assert row['AcceleratorCount'] == 1, row tpu_price = None @@ -475,9 +594,12 @@ def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]: # whether the TPU is a single device or a pod. # For TPU-v4, the pricing is uniform, and thus the pricing API # only provides the price of TPU-v4 pods. - is_pod = num_cores > 8 or tpu_version == 'v4' + # The price shown for v5 TPU is per chip hour, so there is no 'Pod' + # keyword in the description. + is_pod = ((num_cores > 8 or tpu_version == 'v4') and + not tpu_version.startswith('v5')) - for sku in skus: + for sku in gce_skus + tpu_skus: if tpu_region not in sku['serviceRegions']: continue description = sku['description'] @@ -489,7 +611,7 @@ def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]: if 'Preemptible' in description: continue - if f'Tpu-{tpu_version}' not in description: + if _get_tpu_description_str(tpu_version) not in description: continue if is_pod: if 'Pod' not in description: @@ -500,7 +622,15 @@ def get_tpu_price(row: pd.Series, spot: bool) -> Optional[float]: unit_price = _get_unit_price(sku) tpu_device_price = unit_price - tpu_core_price = tpu_device_price / 8 + # v5p naming convention is v$VERSION_NUMBERp-$CORES_COUNT, while + # v5e is v$VERSION_NUMBER-$CHIP_COUNT. In the same time, V5 price + # is shown as per chip price, which is 2 cores for v5p and 1 core + # for v5e. Reference here: + # https://cloud.google.com/tpu/docs/v5p#using-accelerator-type + # https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config + core_per_sku = (1 if tpu_version == 'v5litepod' else + 2 if tpu_version == 'v5p' else 8) + tpu_core_price = tpu_device_price / core_per_sku tpu_price = num_cores * tpu_core_price break @@ -546,7 +676,8 @@ def get_catalog_df(region_prefix: str) -> 'pd.DataFrame': region_prefix)] if not gpu_df.empty else gpu_df gcp_tpu_skus = get_skus(TPU_SERVICE_ID) - tpu_df = get_tpu_df(gcp_tpu_skus) + # TPU V5 SKU is not included in the TPU SKUs but in the GCE SKUs. + tpu_df = get_tpu_df(gcp_skus, gcp_tpu_skus) # Merge the dataframes. df = pd.concat([vm_df, gpu_df, tpu_df, TPU_V4_HOST_DF]) diff --git a/sky/resources.py b/sky/resources.py index f0cb1abda1e..2f19cd1aa01 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -578,10 +578,17 @@ def _set_accelerators( 'Cannot specify instance type' f' (got "{self.instance_type}") for TPU VM.') if 'runtime_version' not in accelerator_args: - if use_tpu_vm: - accelerator_args['runtime_version'] = 'tpu-vm-base' - else: - accelerator_args['runtime_version'] = '2.12.0' + + def _get_default_runtime_version() -> str: + if not use_tpu_vm: + return '2.12.0' + # TPU V5 requires a newer runtime version. + if acc.startswith('tpu-v5'): + return 'v2-alpha-tpuv5' + return 'tpu-vm-base' + + accelerator_args['runtime_version'] = ( + _get_default_runtime_version()) logger.info( 'Missing runtime_version in accelerator_args, using' f' default ({accelerator_args["runtime_version"]})') From 90de1b2564d16463a49f198199da4dc3e9540695 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 19 Aug 2024 16:50:53 -0700 Subject: [PATCH 04/12] [Docs] Fix imgur links (#3846) * Fix imgur links * Remove unecessary file * revert --- docs/source/examples/interactive-development.rst | 2 +- llm/codellama/README.md | 4 ++-- llm/falcon/README.md | 2 +- llm/gpt-2/README.md | 8 ++++---- llm/llama-2/README.md | 2 +- llm/llama-3/README.md | 4 ++-- llm/llama-3_1-finetuning/readme.md | 6 +++--- llm/lorax/README.md | 2 +- llm/vicuna-llama-2/README.md | 6 +++--- llm/vllm/README.md | 2 +- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/source/examples/interactive-development.rst b/docs/source/examples/interactive-development.rst index cc50f8e6ea8..40920934597 100644 --- a/docs/source/examples/interactive-development.rst +++ b/docs/source/examples/interactive-development.rst @@ -110,7 +110,7 @@ This is supported by simply connecting VSCode to the cluster with the cluster na For more details, please refer to the `VSCode documentation `__. -.. image:: https://imgur.com/8mKfsET.gif +.. image:: https://i.imgur.com/8mKfsET.gif :align: center :alt: Connect to the cluster with VSCode diff --git a/llm/codellama/README.md b/llm/codellama/README.md index 8e5025d22b5..f145fd062ff 100644 --- a/llm/codellama/README.md +++ b/llm/codellama/README.md @@ -10,14 +10,14 @@ The followings are the demos of Code Llama 70B hosted by SkyPilot Serve (aka Sky ## Demos
- +
Coding Assistant: Connect to hosted Code Llama with Tabby in VScode
- +
Chat: Connect to hosted Code Llama with FastChat
diff --git a/llm/falcon/README.md b/llm/falcon/README.md index 837e93f5558..6eb480d9ea8 100644 --- a/llm/falcon/README.md +++ b/llm/falcon/README.md @@ -50,7 +50,7 @@ sky launch -c falcon -s falcon.yaml --no-use-spot For reference, below is a loss graph you may expect to see, and the amount of time and the approximate cost of fine-tuning each of the models over 500 epochs (assuming a spot instance A100 GPU rate at $1.1 / hour and a A100-80GB rate of $1.61 / hour): -image +image 1. `ybelkada/falcon-7b-sharded-bf16`: 2.5 to 3 hours using 1 A100 spot GPU; total cost ≈ $3.3. diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index bc9893fec5b..10fa2cf6998 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -28,14 +28,14 @@ Run the following command to start GPT-2 (124M) training on a GPU VM with 8 A100 sky launch -c gpt2 gpt2.yaml ``` -![GPT-2 training with 8 A100 GPUs](https://imgur.com/v8SGpsF.png) +![GPT-2 training with 8 A100 GPUs](https://i.imgur.com/v8SGpsF.png) Or, you can train the model with a single A100, by adding `--gpus A100`: ```bash sky launch -c gpt2 gpt2.yaml --gpus A100 ``` -![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png) +![GPT-2 training with a single A100](https://i.imgur.com/hN65g4r.png) It is also possible to speed up the training of the model on 8 H100 (2.3x more tok/s than 8x A100s): @@ -43,7 +43,7 @@ It is also possible to speed up the training of the model on 8 H100 (2.3x more t sky launch -c gpt2 gpt2.yaml --gpus H100:8 ``` -![GPT-2 training with 8 H100](https://imgur.com/STbi80b.png) +![GPT-2 training with 8 H100](https://i.imgur.com/STbi80b.png) ### Download logs and visualizations @@ -54,7 +54,7 @@ scp -r gpt2:~/llm.c/log124M . We can visualize the training progress with the notebook provided in [llm.c](https://github.com/karpathy/llm.c/blob/master/dev/vislog.ipynb). (Note: we cut off the training after 10K steps, which already achieve similar validation loss as OpenAI GPT-2 checkpoint.)
- +
> Yes! We are able to reproduce the training of GPT-2 (124M) on any cloud with SkyPilot. diff --git a/llm/llama-2/README.md b/llm/llama-2/README.md index d8f8151572e..4f1a8f60cae 100644 --- a/llm/llama-2/README.md +++ b/llm/llama-2/README.md @@ -94,6 +94,6 @@ You can also host the official FAIR model without using huggingface and gradio. ``` 3. Open http://localhost:7681 in your browser and start chatting! -LLaMA chatbot running on the cloud via SkyPilot +LLaMA chatbot running on the cloud via SkyPilot diff --git a/llm/llama-3/README.md b/llm/llama-3/README.md index d0c28dc93c6..ef19d94b5c0 100644 --- a/llm/llama-3/README.md +++ b/llm/llama-3/README.md @@ -5,7 +5,7 @@

-Llama-3 x SkyPilot +Llama-3 x SkyPilot

[Llama-3](https://github.com/meta-llama/llama3) is the latest top open-source LLM from Meta. It has been released with a license that authorizes commercial use. You can deploy a private Llama-3 chatbot with SkyPilot in your own cloud with just one simple command. @@ -248,7 +248,7 @@ To use the Gradio UI, open the URL shown in the logs:

-Gradio UI serving Llama-3 +Gradio UI serving Llama-3

To stop the instance: diff --git a/llm/llama-3_1-finetuning/readme.md b/llm/llama-3_1-finetuning/readme.md index 836f3bf1b3b..935dccde84e 100644 --- a/llm/llama-3_1-finetuning/readme.md +++ b/llm/llama-3_1-finetuning/readme.md @@ -135,7 +135,7 @@ sky launch -c llama31 lora.yaml \
- +
Training Loss of LoRA finetuning Llama 3.1
@@ -218,10 +218,10 @@ run: | ## Appendix: Preparation 1. Request the access to [Llama 3.1 weights on huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) (Click on the blue box and follow the steps): -![](https://imgur.com/snIQhr9.png) +![](https://i.imgur.com/snIQhr9.png) 2. Get your [huggingface access token](https://huggingface.co/settings/tokens): -![](https://imgur.com/3idBgHn.png) +![](https://i.imgur.com/3idBgHn.png) 3. Add huggingface token to your environment variable: diff --git a/llm/lorax/README.md b/llm/lorax/README.md index 2fe548c92a8..6cc44cf1134 100644 --- a/llm/lorax/README.md +++ b/llm/lorax/README.md @@ -4,7 +4,7 @@

- LoRAX + LoRAX

[LoRAX](https://github.com/predibase/lorax) (LoRA eXchange) is a framework that allows users to serve thousands of fine-tuned LLMs on a single GPU, dramatically reducing the cost of serving without compromising on throughput or latency. It works by dynamically loading multiple fine-tuned "adapters" (LoRAs, etc.) on top of a single base model at runtime. Concurrent requests for different adapters can be processed together in a single batch, allowing LoRAX to maintain near linear throughput scaling as the number of adapters increases. diff --git a/llm/vicuna-llama-2/README.md b/llm/vicuna-llama-2/README.md index 899792c299d..24caa525a56 100644 --- a/llm/vicuna-llama-2/README.md +++ b/llm/vicuna-llama-2/README.md @@ -1,6 +1,6 @@ # Train Your Own Vicuna on Llama-2 -![Vicuna-Llama-2](https://imgur.com/McZWg6z.gif "Result model in action, trained using this guide. From the SkyPilot and Vicuna teams.") +![Vicuna-Llama-2](https://i.imgur.com/McZWg6z.gif "Result model in action, trained using this guide. From the SkyPilot and Vicuna teams.") Meta released [Llama 2](https://ai.meta.com/llama/) two weeks ago and has made a big wave in the AI community. In our opinion, its biggest impact is that the model is now released under a [permissive license](https://github.com/facebookresearch/llama/blob/main/LICENSE) that **allows the model weights to be used commercially**[^1]. This differs from Llama 1 which cannot be used commercially. @@ -106,7 +106,7 @@ sky launch --no-use-spot ...

- Optimizer + Optimizer

**Optional**: Try out the training for the 13B model: @@ -139,7 +139,7 @@ sky launch -c serve serve.yaml --env MODEL_CKPT=/chatbot/ ``` In [serve.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna-llama-2/serve.yaml), we specified launching a Gradio server that serves the model checkpoint at `/chatbot/7b`. -![Vicuna-Llama-2](https://imgur.com/McZWg6z.gif "Serving the resulting model with Gradio.") +![Vicuna-Llama-2](https://i.imgur.com/McZWg6z.gif "Serving the resulting model with Gradio.") > **Tip**: You can also switch to a cheaper accelerator, such as L4, to save costs, by adding `--gpus L4` to the above command. diff --git a/llm/vllm/README.md b/llm/vllm/README.md index e3a2befbecc..9fb3c0c1364 100644 --- a/llm/vllm/README.md +++ b/llm/vllm/README.md @@ -4,7 +4,7 @@

- vLLM + vLLM

This README contains instructions to run a demo for vLLM, an open-source library for fast LLM inference and serving, which improves the throughput compared to HuggingFace by **up to 24x**. From 2691d4b7a7e21dc6624cc43928442d85f2f289ac Mon Sep 17 00:00:00 2001 From: landscapepainter <34902420+landscapepainter@users.noreply.github.com> Date: Mon, 19 Aug 2024 18:45:36 -0700 Subject: [PATCH 05/12] [Azure][Storage] Update default storage account naming with subscription id hash (#3796) * add subscription id hash on default storage account name * format * add comment * nit * update default storage account name with hashed region value * nit * format * nit * comment update * nit --- sky/data/storage.py | 53 ++++++++++++++++++++++++++++++++++++++++----- tests/test_smoke.py | 41 +++++++++++++++-------------------- 2 files changed, 64 insertions(+), 30 deletions(-) diff --git a/sky/data/storage.py b/sky/data/storage.py index f09d79ea48e..b915d1c6d54 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -1,5 +1,6 @@ """Storage and Store Classes for Sky Data.""" import enum +import hashlib import os import re import shlex @@ -1942,8 +1943,15 @@ class AzureBlobStore(AbstractStore): """Represents the backend for Azure Blob Storage Container.""" _ACCESS_DENIED_MESSAGE = 'Access Denied' - DEFAULT_STORAGE_ACCOUNT_NAME = 'sky{region}{user_hash}' DEFAULT_RESOURCE_GROUP_NAME = 'sky{user_hash}' + # Unlike resource group names, which only need to be unique within the + # subscription, storage account names must be globally unique across all of + # Azure users. Hence, the storage account name includes the subscription + # hash as well to ensure its uniqueness. + DEFAULT_STORAGE_ACCOUNT_NAME = ( + 'sky{region_hash}{user_hash}{subscription_hash}') + _SUBSCRIPTION_HASH_LENGTH = 4 + _REGION_HASH_LENGTH = 4 class AzureBlobStoreMetadata(AbstractStore.StoreMetadata): """A pickle-able representation of Azure Blob Store. @@ -1977,7 +1985,7 @@ def __init__(self, name: str, source: str, storage_account_name: str = '', - region: Optional[str] = None, + region: Optional[str] = 'eastus', is_sky_managed: Optional[bool] = None, sync_on_reconstruction: bool = True): self.storage_client: 'storage.Client' @@ -2156,6 +2164,41 @@ def initialize(self): # If is_sky_managed is specified, then we take no action. self.is_sky_managed = is_new_bucket + @staticmethod + def get_default_storage_account_name(region: Optional[str]) -> str: + """Generates a unique default storage account name. + + The subscription ID is included to avoid conflicts when user switches + subscriptions. The length of region_hash, user_hash, and + subscription_hash are adjusted to ensure the storage account name + adheres to the 24-character limit, as some region names can be very + long. Using a 4-character hash for the region helps keep the name + concise and prevents potential conflicts. + Reference: https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/resource-name-rules#microsoftstorage # pylint: disable=line-too-long + + Args: + region: Name of the region to create the storage account/container. + + Returns: + Name of the default storage account. + """ + assert region is not None + subscription_id = azure.get_subscription_id() + subscription_hash_obj = hashlib.md5(subscription_id.encode('utf-8')) + subscription_hash = subscription_hash_obj.hexdigest( + )[:AzureBlobStore._SUBSCRIPTION_HASH_LENGTH] + region_hash_obj = hashlib.md5(region.encode('utf-8')) + region_hash = region_hash_obj.hexdigest()[:AzureBlobStore. + _REGION_HASH_LENGTH] + + storage_account_name = ( + AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME.format( + region_hash=region_hash, + user_hash=common_utils.get_user_hash(), + subscription_hash=subscription_hash)) + + return storage_account_name + def _get_storage_account_and_resource_group( self) -> Tuple[str, Optional[str]]: """Get storage account and resource group to be used for AzureBlobStore @@ -2239,10 +2282,8 @@ def _get_storage_account_and_resource_group( else: # If storage account name is not provided from config, then # use default resource group and storage account names. - storage_account_name = ( - self.DEFAULT_STORAGE_ACCOUNT_NAME.format( - region=self.region, - user_hash=common_utils.get_user_hash())) + storage_account_name = self.get_default_storage_account_name( + self.region) resource_group_name = (self.DEFAULT_RESOURCE_GROUP_NAME.format( user_hash=common_utils.get_user_hash())) try: diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 874e51e0f9c..28eeeed5190 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -48,6 +48,7 @@ from sky import jobs from sky import serve from sky import skypilot_config +from sky.adaptors import azure from sky.adaptors import cloudflare from sky.adaptors import ibm from sky.clouds import AWS @@ -1103,9 +1104,8 @@ def test_azure_storage_mounts_with_stop(): cloud = 'azure' storage_name = f'sky-test-{int(time.time())}' default_region = 'eastus' - storage_account_name = ( - storage_lib.AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME.format( - region=default_region, user_hash=common_utils.get_user_hash())) + storage_account_name = (storage_lib.AzureBlobStore. + get_default_storage_account_name(default_region)) storage_account_key = data_utils.get_az_storage_account_key( storage_account_name) template_str = pathlib.Path( @@ -2990,8 +2990,7 @@ def test_managed_jobs_storage(generic_cloud: str): region = 'westus2' region_flag = f' --region {region}' storage_account_name = ( - storage_lib.AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME.format( - region=region, user_hash=common_utils.get_user_hash())) + storage_lib.AzureBlobStore.get_default_storage_account_name(region)) region_cmd = TestStorageWithCredentials.cli_region_cmd( storage_lib.StoreType.AZURE, storage_account_name=storage_account_name) @@ -4287,9 +4286,8 @@ def cli_delete_cmd(store_type, if store_type == storage_lib.StoreType.AZURE: default_region = 'eastus' storage_account_name = ( - storage_lib.AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME.format( - region=default_region, - user_hash=common_utils.get_user_hash())) + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) storage_account_key = data_utils.get_az_storage_account_key( storage_account_name) return ('az storage container delete ' @@ -4324,11 +4322,9 @@ def cli_ls_cmd(store_type, bucket_name, suffix=''): config_storage_account = skypilot_config.get_nested( ('azure', 'storage_account'), None) storage_account_name = config_storage_account if ( - config_storage_account is not None - ) else ( - storage_lib.AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME.format( - region=default_region, - user_hash=common_utils.get_user_hash())) + config_storage_account is not None) else ( + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) storage_account_key = data_utils.get_az_storage_account_key( storage_account_name) list_cmd = ('az storage blob list ' @@ -4390,9 +4386,8 @@ def cli_count_name_in_bucket(store_type, if storage_account_name is None: default_region = 'eastus' storage_account_name = ( - storage_lib.AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME. - format(region=default_region, - user_hash=common_utils.get_user_hash())) + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) storage_account_key = data_utils.get_az_storage_account_key( storage_account_name) return ('az storage blob list ' @@ -4418,9 +4413,8 @@ def cli_count_file_in_bucket(store_type, bucket_name): elif store_type == storage_lib.StoreType.AZURE: default_region = 'eastus' storage_account_name = ( - storage_lib.AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME.format( - region=default_region, - user_hash=common_utils.get_user_hash())) + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) storage_account_key = data_utils.get_az_storage_account_key( storage_account_name) return ('az storage blob list ' @@ -4622,8 +4616,8 @@ def tmp_az_bucket(self, tmp_bucket_name): # Creates a temporary bucket using gsutil default_region = 'eastus' storage_account_name = ( - storage_lib.AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME.format( - region=default_region, user_hash=common_utils.get_user_hash())) + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) storage_account_key = data_utils.get_az_storage_account_key( storage_account_name) bucket_uri = data_utils.AZURE_CONTAINER_URL.format( @@ -4859,9 +4853,8 @@ def test_nonexistent_bucket(self, nonexist_bucket_url): elif nonexist_bucket_url.startswith('https'): default_region = 'eastus' storage_account_name = ( - storage_lib.AzureBlobStore.DEFAULT_STORAGE_ACCOUNT_NAME. - format(region=default_region, - user_hash=common_utils.get_user_hash())) + storage_lib.AzureBlobStore.get_default_storage_account_name( + default_region)) storage_account_key = data_utils.get_az_storage_account_key( storage_account_name) command = f'az storage container exists --account-name {storage_account_name} --account-key {storage_account_key} --name {nonexist_bucket_name}' From 87e55ab961f0d10622006618fc6f65d3ce2d5beb Mon Sep 17 00:00:00 2001 From: Seth Kimmel Date: Tue, 20 Aug 2024 12:02:27 -0700 Subject: [PATCH 06/12] Update aws.rst (#3849) * Update aws.rst * Update docs/source/cloud-setup/cloud-permissions/aws.rst Co-authored-by: Zhanghao Wu --------- Co-authored-by: Zhanghao Wu --- docs/source/cloud-setup/cloud-permissions/aws.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/cloud-setup/cloud-permissions/aws.rst b/docs/source/cloud-setup/cloud-permissions/aws.rst index e34499df3b4..89510331988 100644 --- a/docs/source/cloud-setup/cloud-permissions/aws.rst +++ b/docs/source/cloud-setup/cloud-permissions/aws.rst @@ -148,7 +148,7 @@ AWS accounts can be attached with a policy that limits the permissions of the ac :align: center :alt: AWS Add Policy -8. **Optional**: If you would like to have your users access S3 buckets: You can additionally attach S3 access, such as the "AmazonS3FullAccess" policy. +8. **Optional**: If you would like to have your users access S3 buckets: You can additionally attach S3 access, such as the "AmazonS3FullAccess" policy. Note that enabling S3 access is required to use :ref:`managed-jobs` with `workdir` or `file_mounts` for now. .. image:: ../../images/screenshots/aws/aws-s3-policy.png :width: 80% From fa60f7ab533190099662c49731d1c0af13abd6b9 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 20 Aug 2024 17:01:28 -0700 Subject: [PATCH 07/12] [Docker] Add `--gpus all` for docker when GPU is available (#3833) * Add --gpus all for when GPU is available * Get rid of --gpus all --- sky/provision/docker_utils.py | 2 +- sky/templates/aws-ray.yml.j2 | 3 --- sky/templates/azure-ray.yml.j2 | 3 --- sky/templates/gcp-ray.yml.j2 | 3 --- sky/templates/paperspace-ray.yml.j2 | 3 --- 5 files changed, 1 insertion(+), 13 deletions(-) diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index aa29a3666a3..e989fbc085a 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -381,7 +381,7 @@ def _configure_runtime(self, run_options: List[str]) -> List[str]: if 'nvidia-container-runtime' in runtime_output: try: self._run('nvidia-smi', log_err_when_fail=False) - return run_options + ['--runtime=nvidia'] + return run_options + ['--runtime=nvidia', '--gpus all'] except Exception as e: # pylint: disable=broad-except logger.debug( 'Nvidia Container Runtime is present in the docker image' diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 0a6b0bcc08c..4cfecfe2b12 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -11,9 +11,6 @@ docker: container_name: {{docker_container_name}} run_options: - --ulimit nofile=1048576:1048576 - {%- if custom_resources is not none %} - --gpus all - {%- endif %} {%- for run_option in docker_run_options %} - {{run_option}} {%- endfor %} diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 39672a976b8..65d500fc677 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -11,9 +11,6 @@ docker: container_name: {{docker_container_name}} run_options: - --ulimit nofile=1048576:1048576 - {%- if custom_resources is not none %} - --gpus all - {%- endif %} {%- for run_option in docker_run_options %} - {{run_option}} {%- endfor %} diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index d7e787953d9..bcc16bac531 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -12,9 +12,6 @@ docker: container_name: {{docker_container_name}} run_options: - --ulimit nofile=1048576:1048576 - {%- if gpu is not none %} - --gpus all - {%- endif %} {%- for run_option in docker_run_options %} - {{run_option}} {%- endfor %} diff --git a/sky/templates/paperspace-ray.yml.j2 b/sky/templates/paperspace-ray.yml.j2 index 400714978b9..8eea5ac4f8a 100644 --- a/sky/templates/paperspace-ray.yml.j2 +++ b/sky/templates/paperspace-ray.yml.j2 @@ -11,9 +11,6 @@ docker: container_name: {{docker_container_name}} run_options: - --ulimit nofile=1048576:1048576 - {%- if custom_resources is not none %} - --gpus all - {%- endif %} {%- for run_option in docker_run_options %} - {{run_option}} {%- endfor %} From 2ea613f89e7aee947117c9cb56e5d0ced3f0db94 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 20 Aug 2024 18:35:06 -0700 Subject: [PATCH 08/12] [Azure] Fix azure failover with `RoleAssigmentUpdateNotPermitted` (#3848) * Fix azure failover * Add unique id back --- sky/provision/azure/config.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sky/provision/azure/config.py b/sky/provision/azure/config.py index 146deaa6781..7b50c3d8c0f 100644 --- a/sky/provision/azure/config.py +++ b/sky/provision/azure/config.py @@ -3,6 +3,7 @@ Creates the resource group and deploys the configuration template to Azure for a cluster to be launched. """ +import hashlib import json import logging from pathlib import Path @@ -15,6 +16,7 @@ logger = logging.getLogger(__name__) +UNIQUE_ID_LEN = 4 _DEPLOYMENT_NAME = 'skypilot-config' _LEGACY_DEPLOYMENT_NAME = 'ray-config' _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT = 480 # 8 minutes @@ -103,10 +105,12 @@ def bootstrap_instances( logger.info(f'Using cluster name: {cluster_name_on_cloud}') + hasher = hashlib.md5(provider_config['resource_group'].encode('utf-8')) + unique_id = hasher.hexdigest()[:UNIQUE_ID_LEN] subnet_mask = provider_config.get('subnet_mask') if subnet_mask is None: # choose a random subnet, skipping most common value of 0 - random.seed(cluster_name_on_cloud) + random.seed(unique_id) subnet_mask = f'10.{random.randint(1, 254)}.0.0/16' logger.info(f'Using subnet mask: {subnet_mask}') @@ -119,10 +123,10 @@ def bootstrap_instances( 'value': subnet_mask }, 'clusterId': { - # We use the cluster name as the unique ID for the cluster, - # as we have already appended the user hash to the cluster - # name. - 'value': cluster_name_on_cloud + # We use the cluster name + resource group hash as the + # unique ID for the cluster, as we need to make sure that + # the deployments have unique names during failover. + 'value': f'{cluster_name_on_cloud}-{unique_id}' }, }, } From 6101a04198ad50cc04f68a8ca38ca98bfbd72954 Mon Sep 17 00:00:00 2001 From: mjibril Date: Wed, 21 Aug 2024 18:43:14 +0100 Subject: [PATCH 09/12] [FluidStack][API] Update FluidStack to new API (#3799) * [FluidStack][API] Update FluidStack to new API * Update FluidStack to new API * Simplify deployment by removing CUDA installation and using pre-built images Co-authored-by: Mubarak Jibril * fix format --- docs/source/getting-started/installation.rst | 5 +- sky/clouds/fluidstack.py | 59 ++--- .../data_fetchers/fetch_fluidstack.py | 218 ++++++++++++++---- sky/provision/fluidstack/fluidstack_utils.py | 175 +++++--------- sky/provision/fluidstack/instance.py | 48 +--- sky/templates/fluidstack-ray.yml.j2 | 1 - tests/test_smoke.py | 19 ++ 7 files changed, 281 insertions(+), 244 deletions(-) diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index be7ae1ff327..9f251a5aafe 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -301,13 +301,12 @@ RunPod Fluidstack ~~~~~~~~~~~~~~~~~~ -`Fluidstack `__ is a cloud provider offering low-cost GPUs. To configure Fluidstack access, go to the `Home `__ page on your Fluidstack console to generate an API key and then add the :code:`API key` to :code:`~/.fluidstack/api_key` and the :code:`API token` to :code:`~/.fluidstack/api_token`: - +`Fluidstack `__ is a cloud provider offering low-cost GPUs. To configure Fluidstack access, go to the `Home `__ page on your Fluidstack console to generate an API key and then add the :code:`API key` to :code:`~/.fluidstack/api_key` : .. code-block:: shell mkdir -p ~/.fluidstack echo "your_api_key_here" > ~/.fluidstack/api_key - echo "your_api_token_here" > ~/.fluidstack/api_token + Cudo Compute diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index d292ace02f8..ef397d4c55e 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -15,8 +15,7 @@ _CREDENTIAL_FILES = [ # credential files for FluidStack, - fluidstack_utils.FLUIDSTACK_API_KEY_PATH, - fluidstack_utils.FLUIDSTACK_API_TOKEN_PATH, + fluidstack_utils.FLUIDSTACK_API_KEY_PATH ] if typing.TYPE_CHECKING: # Renaming to avoid shadowing variables. @@ -189,20 +188,12 @@ def make_deploy_resources_variables( custom_resources = json.dumps(acc_dict, separators=(',', ':')) else: custom_resources = None - cuda_installation_commands = """ - sudo wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -O /usr/local/cuda-keyring_1.1-1_all.deb; - sudo dpkg -i /usr/local/cuda-keyring_1.1-1_all.deb; - sudo apt-get update; - sudo apt-get -y install cuda-toolkit-12-3; - sudo apt-get install -y cuda-drivers; - sudo apt-get install -y python3-pip; - nvidia-smi || sudo reboot;""" + return { 'instance_type': resources.instance_type, 'custom_resources': custom_resources, 'region': region.name, - 'fluidstack_username': self.default_username(region.name), - 'cuda_installation_commands': cuda_installation_commands, + 'fluidstack_username': 'ubuntu', } def _get_feasible_launchable_resources( @@ -270,17 +261,26 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: try: assert os.path.exists( os.path.expanduser(fluidstack_utils.FLUIDSTACK_API_KEY_PATH)) - assert os.path.exists( - os.path.expanduser(fluidstack_utils.FLUIDSTACK_API_TOKEN_PATH)) + + with open(os.path.expanduser( + fluidstack_utils.FLUIDSTACK_API_KEY_PATH), + encoding='UTF-8') as f: + api_key = f.read().strip() + if not api_key.startswith('api_key'): + return False, ('Invalid FluidStack API key format. ' + 'To configure credentials, go to:\n ' + ' https://dashboard.fluidstack.io \n ' + 'to obtain an API key, ' + 'then add save the contents ' + 'to ~/.fluidstack/api_key \n') except AssertionError: - return False, ( - 'Failed to access FluidStack Cloud' - ' with credentials. ' - 'To configure credentials, go to:\n ' - ' https://console.fluidstack.io \n ' - 'to obtain an API key and API Token, ' - 'then add save the contents ' - 'to ~/.fluidstack/api_key and ~/.fluidstack/api_token \n') + return False, ('Failed to access FluidStack Cloud' + ' with credentials. ' + 'To configure credentials, go to:\n ' + ' https://dashboard.fluidstack.io \n ' + 'to obtain an API key, ' + 'then add save the contents ' + 'to ~/.fluidstack/api_key \n') except requests.exceptions.ConnectionError: return False, ('Failed to verify FluidStack Cloud credentials. ' 'Check your network connection ' @@ -303,21 +303,6 @@ def validate_region_zone(self, region: Optional[str], zone: Optional[str]): zone, clouds='fluidstack') - @classmethod - def default_username(cls, region: str) -> str: - return { - 'norway_2_eu': 'ubuntu', - 'calgary_1_canada': 'ubuntu', - 'norway_3_eu': 'ubuntu', - 'norway_4_eu': 'ubuntu', - 'india_2': 'root', - 'nevada_1_usa': 'fsuser', - 'generic_1_canada': 'ubuntu', - 'iceland_1_eu': 'ubuntu', - 'new_york_1_usa': 'fsuser', - 'illinois_1_usa': 'fsuser' - }.get(region, 'ubuntu') - @classmethod def query_status( cls, diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py b/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py index 5d50399ab89..cf943541e08 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py @@ -11,10 +11,140 @@ import requests -ENDPOINT = 'https://api.fluidstack.io/v1/plans' +ENDPOINT = 'https://platform.fluidstack.io/list_available_configurations' DEFAULT_FLUIDSTACK_API_KEY_PATH = os.path.expanduser('~/.fluidstack/api_key') -DEFAULT_FLUIDSTACK_API_TOKEN_PATH = os.path.expanduser( - '~/.fluidstack/api_token') + +plan_vcpus_memory = [{ + 'gpu_type': 'RTX_A6000_48GB', + 'gpu_count': 2, + 'min_cpu_count': 12, + 'min_memory': 110.0 +}, { + 'gpu_type': 'RTX_A6000_48GB', + 'gpu_count': 4, + 'min_cpu_count': 24, + 'min_memory': 220.0 +}, { + 'gpu_type': 'A100_NVLINK_80GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 960.0 +}, { + 'gpu_type': 'H100_PCIE_80GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 1440.0 +}, { + 'gpu_type': 'RTX_A4000_16GB', + 'gpu_count': 2, + 'min_cpu_count': 12, + 'min_memory': 48.0 +}, { + 'gpu_type': 'H100_PCIE_80GB', + 'gpu_count': 2, + 'min_cpu_count': 60, + 'min_memory': 360.0 +}, { + 'gpu_type': 'RTX_A6000_48GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 464.0 +}, { + 'gpu_type': 'H100_NVLINK_80GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 1440.0 +}, { + 'gpu_type': 'H100_PCIE_80GB', + 'gpu_count': 1, + 'min_cpu_count': 28, + 'min_memory': 180.0 +}, { + 'gpu_type': 'RTX_A5000_24GB', + 'gpu_count': 1, + 'min_cpu_count': 8, + 'min_memory': 30.0 +}, { + 'gpu_type': 'RTX_A5000_24GB', + 'gpu_count': 2, + 'min_cpu_count': 16, + 'min_memory': 60.0 +}, { + 'gpu_type': 'L40_48GB', + 'gpu_count': 2, + 'min_cpu_count': 64, + 'min_memory': 120.0 +}, { + 'gpu_type': 'RTX_A4000_16GB', + 'gpu_count': 8, + 'min_cpu_count': 48, + 'min_memory': 192.0 +}, { + 'gpu_type': 'RTX_A4000_16GB', + 'gpu_count': 1, + 'min_cpu_count': 6, + 'min_memory': 24.0 +}, { + 'gpu_type': 'RTX_A4000_16GB', + 'gpu_count': 4, + 'min_cpu_count': 24, + 'min_memory': 96.0 +}, { + 'gpu_type': 'A100_PCIE_80GB', + 'gpu_count': 4, + 'min_cpu_count': 124, + 'min_memory': 480.0 +}, { + 'gpu_type': 'H100_PCIE_80GB', + 'gpu_count': 4, + 'min_cpu_count': 124, + 'min_memory': 720.0 +}, { + 'gpu_type': 'L40_48GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 480.0 +}, { + 'gpu_type': 'RTX_A5000_24GB', + 'gpu_count': 8, + 'min_cpu_count': 64, + 'min_memory': 240.0 +}, { + 'gpu_type': 'L40_48GB', + 'gpu_count': 1, + 'min_cpu_count': 32, + 'min_memory': 60.0 +}, { + 'gpu_type': 'RTX_A6000_48GB', + 'gpu_count': 1, + 'min_cpu_count': 6, + 'min_memory': 55.0 +}, { + 'gpu_type': 'L40_48GB', + 'gpu_count': 4, + 'min_cpu_count': 126, + 'min_memory': 240.0 +}, { + 'gpu_type': 'A100_PCIE_80GB', + 'gpu_count': 1, + 'min_cpu_count': 28, + 'min_memory': 120.0 +}, { + 'gpu_type': 'A100_PCIE_80GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 1440.0 +}, { + 'gpu_type': 'A100_PCIE_80GB', + 'gpu_count': 2, + 'min_cpu_count': 60, + 'min_memory': 240.0 +}, { + 'gpu_type': 'RTX_A5000_24GB', + 'gpu_count': 4, + 'min_cpu_count': 32, + 'min_memory': 120.0 +}] GPU_MAP = { 'H100_PCIE_80GB': 'H100', @@ -47,19 +177,15 @@ def get_regions(plans: List) -> dict: regions = {} for plan in plans: for region in plan.get('regions', []): - regions[region['id']] = region['id'] + regions[region] = region return regions def create_catalog(output_dir: str) -> None: - response = requests.get(ENDPOINT) + with open(DEFAULT_FLUIDSTACK_API_KEY_PATH, 'r', encoding='UTF-8') as f: + api_key = f.read().strip() + response = requests.get(ENDPOINT, headers={'api-key': api_key}) plans = response.json() - #plans = [plan for plan in plans if len(plan['regions']) > 0] - plans = [ - plan for plan in plans if plan['minimum_commitment'] == 'hourly' and - plan['type'] in ['preconfigured'] and - plan['gpu_type'] not in ['NO GPU', 'RTX_3080_10GB', 'RTX_3090_24GB'] - ] with open(os.path.join(output_dir, 'vms.csv'), mode='w', encoding='utf-8') as f: @@ -81,39 +207,45 @@ def create_catalog(output_dir: str) -> None: except KeyError: #print(f'Could not map {plan["gpu_type"]}') continue - gpu_memory = int( - str(plan['configuration']['gpu_memory']).replace('GB', - '')) * 1024 - gpu_cnt = int(plan['configuration']['gpu_count']) - vcpus = float(plan['configuration']['core_count']) - mem = float(plan['configuration']['ram']) - price = float(plan['price']['hourly']) * gpu_cnt - gpuinfo = { - 'Gpus': [{ - 'Name': gpu, - 'Manufacturer': 'NVIDIA', - 'Count': gpu_cnt, - 'MemoryInfo': { - 'SizeInMiB': int(gpu_memory) - }, - }], - 'TotalGpuMemoryInMiB': int(gpu_memory * gpu_cnt), - } - gpuinfo = json.dumps(gpuinfo).replace('"', "'") # pylint: disable=invalid-string-quote - for r in plan.get('regions', []): - if r['id'] == 'india_2': + for gpu_cnt in plan['gpu_counts']: + gpu_memory = float(plan['gpu_type'].split('_')[-1].replace( + 'GB', '')) * 1024 + try: + vcpus_mem = [ + x for x in plan_vcpus_memory + if x['gpu_type'] == plan['gpu_type'] and + x['gpu_count'] == gpu_cnt + ][0] + vcpus = vcpus_mem['min_cpu_count'] + mem = vcpus_mem['min_memory'] + except IndexError: continue - writer.writerow([ - plan['plan_id'], - gpu, - gpu_cnt, - vcpus, - mem, - price, - r['id'], - gpuinfo, - '', - ]) + price = float(plan['price_per_gpu_hr']) * gpu_cnt + gpuinfo = { + 'Gpus': [{ + 'Name': gpu, + 'Manufacturer': 'NVIDIA', + 'Count': gpu_cnt, + 'MemoryInfo': { + 'SizeInMiB': int(gpu_memory) + }, + }], + 'TotalGpuMemoryInMiB': int(gpu_memory * gpu_cnt), + } + gpuinfo = json.dumps(gpuinfo).replace('"', "'") # pylint: disable=invalid-string-quote + instance_type = f'{plan["gpu_type"]}::{gpu_cnt}' + for region in plan.get('regions', []): + writer.writerow([ + instance_type, + gpu, + gpu_cnt, + vcpus, + mem, + price, + region, + gpuinfo, + '', + ]) if __name__ == '__main__': diff --git a/sky/provision/fluidstack/fluidstack_utils.py b/sky/provision/fluidstack/fluidstack_utils.py index ebc616c0bfc..a9efb865a3c 100644 --- a/sky/provision/fluidstack/fluidstack_utils.py +++ b/sky/provision/fluidstack/fluidstack_utils.py @@ -3,7 +3,8 @@ import functools import json import os -from typing import Any, Dict, List, Optional +import time +from typing import Any, Dict, List import uuid import requests @@ -13,9 +14,8 @@ def get_key_suffix(): return str(uuid.uuid4()).replace('-', '')[:8] -ENDPOINT = 'https://api.fluidstack.io/v1/' +ENDPOINT = 'https://platform.fluidstack.io/' FLUIDSTACK_API_KEY_PATH = '~/.fluidstack/api_key' -FLUIDSTACK_API_TOKEN_PATH = '~/.fluidstack/api_token' def read_contents(path: str) -> str: @@ -46,109 +46,76 @@ def raise_fluidstack_error(response: requests.Response) -> None: raise FluidstackAPIError(f'{message}', status_code) -@functools.lru_cache() -def with_nvidia_drivers(region: str): - if region in ['norway_4_eu', 'generic_1_canada']: - return False - client = FluidstackClient() - plans = client.get_plans() - for plan in plans: - if region in [r['id'] for r in plan['regions']]: - if 'Ubuntu 20.04 LTS (Nvidia)' in plan['os_options']: - return True - return False - - class FluidstackClient: """FluidStack API Client""" def __init__(self): self.api_key = read_contents( - os.path.expanduser(FLUIDSTACK_API_KEY_PATH)) - self.api_token = read_contents( - os.path.expanduser(FLUIDSTACK_API_TOKEN_PATH)) + os.path.expanduser(FLUIDSTACK_API_KEY_PATH)).strip() def get_plans(self): - response = requests.get(ENDPOINT + 'plans') + response = requests.get(ENDPOINT + 'list_available_configurations', + headers={'api-key': self.api_key}) raise_fluidstack_error(response) plans = response.json() - plans = [ - plan for plan in plans - if plan['minimum_commitment'] == 'hourly' and plan['type'] in - ['preconfigured', 'custom'] and plan['gpu_type'] != 'NO GPU' - ] return plans - def list_instances( - self, - tag_filters: Optional[Dict[str, - str]] = None) -> List[Dict[str, Any]]: + def list_instances(self) -> List[Dict[str, Any]]: response = requests.get( - ENDPOINT + 'servers', - auth=(self.api_key, self.api_token), + ENDPOINT + 'instances', + headers={'api-key': self.api_key}, ) raise_fluidstack_error(response) instances = response.json() - filtered_instances = [] - - for instance in instances: - if isinstance(instance['tags'], str): - instance['tags'] = json.loads(instance['tags']) - if not instance['tags']: - instance['tags'] = {} - if tag_filters: - for key in tag_filters: - if instance['tags'].get(key, None) != tag_filters[key]: - break - else: - filtered_instances.append(instance) - else: - filtered_instances.append(instance) - - return filtered_instances + return instances def create_instance( self, instance_type: str = '', - hostname: str = '', + name: str = '', region: str = '', ssh_pub_key: str = '', count: int = 1, ) -> List[str]: """Launch new instances.""" - config: Dict[str, Any] = {} plans = self.get_plans() regions = self.list_regions() + gpu_type, gpu_count = instance_type.split('::') + gpu_count = int(gpu_count) + plans = [ - plan for plan in plans if plan['plan_id'] == instance_type and - region in [r['id'] for r in plan['regions']] + plan for plan in plans if plan['gpu_type'] == gpu_type and + gpu_count in plan['gpu_counts'] and region in plan['regions'] ] if not plans: raise FluidstackAPIError( f'Plan {instance_type} out of stock in region {region}') ssh_key = self.get_or_add_ssh_key(ssh_pub_key) - os_id = 'Ubuntu 20.04 LTS' - body = dict(plan=None if config else instance_type, - region=regions[region], - os=os_id, - hostname=hostname, - ssh_keys=[ssh_key['id']], - multiplicity=count, - config=config) - - response = requests.post(ENDPOINT + 'server', - auth=(self.api_key, self.api_token), - json=body) - raise_fluidstack_error(response) - instance_ids = response.json().get('multiple') - assert all(id is not None for id in instance_ids), instance_ids + default_operating_system = 'ubuntu_22_04_lts_nvidia' + instance_ids = [] + for _ in range(count): + body = dict(gpu_type=gpu_type, + gpu_count=gpu_count, + region=regions[region], + operating_system_label=default_operating_system, + name=name, + ssh_key=ssh_key['name']) + + response = requests.post(ENDPOINT + 'instances', + headers={'api-key': self.api_key}, + json=body) + raise_fluidstack_error(response) + instance_id = response.json().get('id') + instance_ids.append(instance_id) + time.sleep(1) + return instance_ids def list_ssh_keys(self): - response = requests.get(ENDPOINT + 'ssh', - auth=(self.api_key, self.api_token)) + response = requests.get(ENDPOINT + 'ssh_keys', + headers={'api-key': self.api_key}) raise_fluidstack_error(response) return response.json() @@ -156,86 +123,50 @@ def get_or_add_ssh_key(self, ssh_pub_key: str = '') -> Dict[str, str]: """Add ssh key if not already added.""" ssh_keys = self.list_ssh_keys() for key in ssh_keys: - if key['public_key'].strip() == ssh_pub_key.strip(): - return { - 'id': key['id'], - 'name': key['name'], - 'ssh_key': ssh_pub_key - } + if key['public_key'].strip().split()[:2] == ssh_pub_key.strip( + ).split()[:2]: + return {'name': key['name'], 'ssh_key': ssh_pub_key} ssh_key_name = 'skypilot-' + get_key_suffix() response = requests.post( - ENDPOINT + 'ssh', - auth=(self.api_key, self.api_token), + ENDPOINT + 'ssh_keys', + headers={'api-key': self.api_key}, json=dict(name=ssh_key_name, public_key=ssh_pub_key), ) raise_fluidstack_error(response) - key_id = response.json()['id'] - return {'id': key_id, 'name': ssh_key_name, 'ssh_key': ssh_pub_key} + return {'name': ssh_key_name, 'ssh_key': ssh_pub_key} @functools.lru_cache() def list_regions(self): - response = requests.get(ENDPOINT + 'plans') - raise_fluidstack_error(response) - plans = response.json() - plans = [ - plan for plan in plans - if plan['minimum_commitment'] == 'hourly' and plan['type'] in - ['preconfigured', 'custom'] and plan['gpu_type'] != 'NO GPU' - ] + plans = self.get_plans() def get_regions(plans: List) -> dict: """Return a list of regions where the plan is available.""" regions = {} for plan in plans: for region in plan.get('regions', []): - regions[region['id']] = region['id'] + regions[region] = region return regions regions = get_regions(plans) return regions def delete(self, instance_id: str): - response = requests.delete(ENDPOINT + 'server/' + instance_id, - auth=(self.api_key, self.api_token)) + response = requests.delete(ENDPOINT + 'instances/' + instance_id, + headers={'api-key': self.api_key}) raise_fluidstack_error(response) return response.json() def stop(self, instance_id: str): - response = requests.put(ENDPOINT + 'server/' + instance_id + '/stop', - auth=(self.api_key, self.api_token)) - raise_fluidstack_error(response) - return response.json() - - def restart(self, instance_id: str): - response = requests.post(ENDPOINT + 'server/' + instance_id + '/reboot', - auth=(self.api_key, self.api_token)) - raise_fluidstack_error(response) - return response.json() - - def info(self, instance_id: str): - response = requests.get(ENDPOINT + f'server/{instance_id}', - auth=(self.api_key, self.api_token)) - raise_fluidstack_error(response) - return response.json() - - def status(self, instance_id: str): - response = self.info(instance_id) - return response['status'] - - def add_tags(self, instance_id: str, tags: Dict[str, str]) -> str: - response = requests.patch( - ENDPOINT + f'server/{instance_id}/tag', - auth=(self.api_key, self.api_token), - json=dict(tags=json.dumps(tags)), - ) + response = requests.put(ENDPOINT + 'instances/' + instance_id + '/stop', + headers={'api-key': self.api_key}) raise_fluidstack_error(response) return response.json() - def rename(self, instance_id: str, hostname: str) -> str: - response = requests.patch( - ENDPOINT + f'server/{instance_id}/rename', - auth=(self.api_key, self.api_token), - json=dict(name=hostname), + def rename(self, instance_id: str, name: str) -> str: + response = requests.put( + ENDPOINT + f'instances/{instance_id}/rename', + headers={'api-key': self.api_key}, + json=dict(new_instance_name=name), ) raise_fluidstack_error(response) return response.json() diff --git a/sky/provision/fluidstack/instance.py b/sky/provision/fluidstack/instance.py index e870ff15e0c..538aafc8887 100644 --- a/sky/provision/fluidstack/instance.py +++ b/sky/provision/fluidstack/instance.py @@ -27,7 +27,7 @@ def get_internal_ip(node_info: Dict[str, Any]) -> None: node_info['internal_ip'] = node_info['ip_address'] runner = command_runner.SSHCommandRunner( (node_info['ip_address'], 22), - ssh_user=node_info['capabilities']['default_user_name'], + ssh_user='ubuntu', ssh_private_key=auth.PRIVATE_SSH_KEY_PATH) result = runner.run(_GET_INTERNAL_IP_CMD, require_outputs=True, @@ -61,7 +61,7 @@ def _filter_instances( if (include_instances is not None and instance['id'] not in include_instances): continue - if instance.get('hostname') in possible_names: + if instance.get('name') in possible_names: filtered_instances[instance['id']] = instance return filtered_instances @@ -69,7 +69,7 @@ def _filter_instances( def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]: head_instance_id = None for inst_id, inst in instances.items(): - if inst['hostname'].endswith('-head'): + if inst['name'].endswith('-head'): head_instance_id = inst_id break return head_instance_id @@ -80,16 +80,7 @@ def run_instances(region: str, cluster_name_on_cloud: str, """Runs instances for the given cluster.""" pending_status = [ - 'create', - 'requesting', - 'provisioning', - 'customizing', - 'starting', - 'stopping', - 'start', - 'stop', - 'reboot', - 'rebooting', + 'pending', ] while True: instances = _filter_instances(cluster_name_on_cloud, pending_status) @@ -127,7 +118,7 @@ def rename(instance_id: str, new_name: str) -> None: f'{instance_name}') rename(instance_id, instance_name) if (instance_id != head_instance_id and - instance['hostname'].endswith('-head')): + instance['name'].endswith('-head')): # Multiple head instances exist. # This is a rare case when the instance name was manually modified # on the cloud or some unexpected behavior happened. @@ -167,7 +158,7 @@ def rename(instance_id: str, new_name: str) -> None: node_type = 'head' if head_instance_id is None else 'worker' try: instance_ids = utils.FluidstackClient().create_instance( - hostname=f'{cluster_name_on_cloud}-{node_type}', + name=f'{cluster_name_on_cloud}-{node_type}', instance_type=config.node_config['InstanceType'], ssh_pub_key=config.node_config['AuthorizedKey'], region=region) @@ -184,9 +175,6 @@ def rename(instance_id: str, new_name: str) -> None: instances = _filter_instances(cluster_name_on_cloud, pending_status + ['running']) if len(instances) < config.count: - # Some of pending instances have been convert to a state that will - # not convert to `running` status. This can be due to resource - # availability issue. all_instances = _filter_instances( cluster_name_on_cloud, status_filters=None, @@ -253,15 +241,11 @@ def terminate_instances( instances = _filter_instances(cluster_name_on_cloud, None) for inst_id, inst in instances.items(): logger.debug(f'Terminating instance {inst_id}: {inst}') - if worker_only and inst['hostname'].endswith('-head'): + if worker_only and inst['name'].endswith('-head'): continue try: utils.FluidstackClient().delete(inst_id) except Exception as e: # pylint: disable=broad-except - if (isinstance(e, utils.FluidstackAPIError) and - 'Machine is already terminated' in str(e)): - logger.debug(f'Instance {inst_id} is already terminated.') - continue with ux_utils.print_exception_no_traceback(): raise RuntimeError( f'Failed to terminate instance {inst_id}: ' @@ -291,7 +275,7 @@ def get_cluster_info( tags={}, ) ] - if instance_info['hostname'].endswith('-head'): + if instance_info['name'].endswith('-head'): head_instance_id = instance_id return common.ClusterInfo(instances=instances, @@ -311,22 +295,10 @@ def query_instances( instances = _filter_instances(cluster_name_on_cloud, None) instances = _filter_instances(cluster_name_on_cloud, None) status_map = { - 'provisioning': status_lib.ClusterStatus.INIT, - 'requesting': status_lib.ClusterStatus.INIT, - 'create': status_lib.ClusterStatus.INIT, - 'customizing': status_lib.ClusterStatus.INIT, - 'stopping': status_lib.ClusterStatus.STOPPED, - 'stop': status_lib.ClusterStatus.STOPPED, - 'start': status_lib.ClusterStatus.INIT, - 'reboot': status_lib.ClusterStatus.STOPPED, - 'rebooting': status_lib.ClusterStatus.STOPPED, + 'pending': status_lib.ClusterStatus.INIT, 'stopped': status_lib.ClusterStatus.STOPPED, - 'starting': status_lib.ClusterStatus.INIT, 'running': status_lib.ClusterStatus.UP, - 'failed to create': status_lib.ClusterStatus.INIT, - 'timeout error': status_lib.ClusterStatus.INIT, - 'out of stock': status_lib.ClusterStatus.INIT, - 'terminating': None, + 'unhealthy': status_lib.ClusterStatus.INIT, 'terminated': None, } statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {} diff --git a/sky/templates/fluidstack-ray.yml.j2 b/sky/templates/fluidstack-ray.yml.j2 index 309a5393828..3eb277ec6d9 100644 --- a/sky/templates/fluidstack-ray.yml.j2 +++ b/sky/templates/fluidstack-ray.yml.j2 @@ -65,7 +65,6 @@ setup_commands: sudo pkill -9 apt-get; sudo pkill -9 dpkg; sudo dpkg --configure -a; - {{ cuda_installation_commands }} mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} {{ ray_skypilot_installation_commands }} diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 28eeeed5190..c9b4c6e0816 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -840,6 +840,7 @@ def test_image_no_conda(): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation @pytest.mark.no_kubernetes # Kubernetes does not support stopping instances def test_custom_default_conda_env(generic_cloud: str): name = _get_cluster_name() @@ -1549,6 +1550,7 @@ def test_job_queue_multinode(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs def test_large_job_queue(generic_cloud: str): name = _get_cluster_name() @@ -1592,6 +1594,7 @@ def test_large_job_queue(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs def test_fast_large_job_queue(generic_cloud: str): # This is to test the jobs can be scheduled quickly when there are many jobs in the queue. @@ -1699,6 +1702,7 @@ def test_multi_echo(generic_cloud: str): # ---------- Task: 1 node training. ---------- +@pytest.mark.no_fluidstack # Fluidstack does not have T4 gpus for now @pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus @pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA @pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead. @@ -2325,6 +2329,7 @@ def test_cancel_azure(): run_one_test(test) +@pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now @pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus @pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA @pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi @@ -3500,6 +3505,7 @@ def test_skyserve_kubernetes_http(): run_one_test(test) +@pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now @pytest.mark.serve def test_skyserve_llm(generic_cloud: str): """Test skyserve with real LLM usecase""" @@ -3557,6 +3563,7 @@ def test_skyserve_spot_recovery(): run_one_test(test) +@pytest.mark.no_fluidstack # Fluidstack does not support spot instances @pytest.mark.serve @pytest.mark.no_kubernetes def test_skyserve_base_ondemand_fallback(generic_cloud: str): @@ -3621,6 +3628,8 @@ def test_skyserve_dynamic_ondemand_fallback(): run_one_test(test) +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack @pytest.mark.serve def test_skyserve_user_bug_restart(generic_cloud: str): """Tests that we restart the service after user bug.""" @@ -3805,6 +3814,8 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str): run_one_test(test) +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack @pytest.mark.serve def test_skyserve_update(generic_cloud: str): """Test skyserve with update""" @@ -3833,6 +3844,10 @@ def test_skyserve_update(generic_cloud: str): run_one_test(test) +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +pytest.mark.no_fluidstack + + @pytest.mark.serve def test_skyserve_rolling_update(generic_cloud: str): """Test skyserve with rolling update""" @@ -3869,6 +3884,7 @@ def test_skyserve_rolling_update(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack @pytest.mark.serve def test_skyserve_fast_update(generic_cloud: str): """Test skyserve with fast update (Increment version of old replicas)""" @@ -3946,6 +3962,7 @@ def test_skyserve_update_autoscale(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # Spot instances are note supported by Fluidstack @pytest.mark.serve @pytest.mark.no_kubernetes # Spot instances are not supported in Kubernetes @pytest.mark.parametrize('mode', ['rolling', 'blue_green']) @@ -4009,6 +4026,8 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): run_one_test(test) +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack @pytest.mark.serve def test_skyserve_failures(generic_cloud: str): """Test replica failure statuses""" From 6c563e0b7b315107714f907e50415a73fd8f6102 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 21 Aug 2024 12:13:42 -0700 Subject: [PATCH 10/12] [Test] minor fix for a typo in fluidstack test (#3855) minor fix for a typo --- tests/test_smoke.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index c9b4c6e0816..4d2c26103a7 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -3845,9 +3845,7 @@ def test_skyserve_update(generic_cloud: str): # TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs -pytest.mark.no_fluidstack - - +@pytest.mark.no_fluidstack @pytest.mark.serve def test_skyserve_rolling_update(generic_cloud: str): """Test skyserve with rolling update""" From 40749e36e89c0b68420095083dacce7f3f56f774 Mon Sep 17 00:00:00 2001 From: Colin Campbell Date: Thu, 22 Aug 2024 00:16:04 -0400 Subject: [PATCH 11/12] Allow smarter device manager to schedule on all nodes (#3857) --- .../kubernetes/manifests/smarter-device-manager-daemonset.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml b/sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml index 664fd69a8c8..2f8abf00550 100644 --- a/sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +++ b/sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml @@ -26,6 +26,9 @@ spec: hostname: smarter-device-management hostNetwork: true dnsPolicy: ClusterFirstWithHostNet + tolerations: + - effect: NoSchedule + operator: Exists containers: - name: smarter-device-manager image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/smarter-device-manager:v1.1.2 From 1cd244401bfd21702f5a97878f013c060f5d9115 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 22 Aug 2024 00:21:32 -0700 Subject: [PATCH 12/12] [AWS] Support targeted on-demand capacity reservations (#3852) * wip Allow prioritize reservations format Allow open capacity reservations Add check reserved resources format Remove specific reservations * parent fcf1f60e2903edf8af7cdddf98189dc0c79e34d1 author Zhanghao Wu 1724175607 +0000 committer Zhanghao Wu 1724210666 +0000 wip Allow prioritize reservations format Add check reserved resources format * Support target capacity reservation provisioning * Fix comments * Add doc * format --- docs/source/reference/config.rst | 36 ++++++++++++++- sky/clouds/aws.py | 42 ++++++++++++++++- sky/clouds/cloud.py | 5 ++ sky/clouds/gcp.py | 4 ++ sky/clouds/utils/aws_utils.py | 57 +++++++++++++++++++++++ sky/optimizer.py | 35 ++++++++++++-- sky/provision/aws/instance.py | 79 ++++++++++++++++++++++++++++---- sky/templates/aws-ray.yml.j2 | 6 +++ sky/utils/schemas.py | 9 ++++ 9 files changed, 257 insertions(+), 16 deletions(-) create mode 100644 sky/clouds/utils/aws_utils.py diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index fc5eddd6a47..2669aef0a1b 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -193,6 +193,35 @@ Available fields and semantics: # Default: false. disk_encrypted: false + # Reserved capacity (optional). + # + # Whether to prioritize capacity reservations (considered as 0 cost) in the + # optimizer. + # + # If you have capacity reservations in your AWS project: + # Setting this to true guarantees the optimizer will pick any matching + # reservation within all regions and AWS will auto consume your reservations + # with instance match criteria to "open", and setting to false means + # optimizer uses regular, non-zero pricing in optimization (if by chance any + # matching reservation exists, AWS will still consume the reservation). + # + # Note: this setting is default to false for performance reasons, as it can + # take half a minute to retrieve the reservations from AWS when set to true. + # + # Default: false. + prioritize_reservations: false + # + # The targeted capacity reservations (CapacityReservationId) to be + # considered when provisioning clusters on AWS. SkyPilot will automatically + # prioritize this reserved capacity (considered as zero cost) if the + # requested resources matches the reservation. + # + # Ref: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/capacity-reservations-launch.html + specific_reservations: + - cr-a1234567 + - cr-b2345678 + + # Identity to use for AWS instances (optional). # # LOCAL_CREDENTIALS: The user's local credential files will be uploaded to @@ -307,13 +336,16 @@ Available fields and semantics: # Setting this to true guarantees the optimizer will pick any matching # reservation and GCP will auto consume your reservation, and setting to # false means optimizer uses regular, non-zero pricing in optimization (if - # by chance any matching reservation is selected, GCP still auto consumes - # the reservation). + # by chance any matching reservation exists, GCP still auto consumes the + # reservation). # # If you have "specifically targeted" reservations (set by the # `specific_reservations` field below): This field will automatically be set # to true. # + # Note: this setting is default to false for performance reasons, as it can + # take half a minute to retrieve the reservations from GCP when set to true. + # # Default: false. prioritize_reservations: false # diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 021f243da70..3a05223574d 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -8,7 +8,7 @@ import subprocess import time import typing -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple from sky import clouds from sky import exceptions @@ -17,6 +17,7 @@ from sky import skypilot_config from sky.adaptors import aws from sky.clouds import service_catalog +from sky.clouds.utils import aws_utils from sky.skylet import constants from sky.utils import common_utils from sky.utils import resources_utils @@ -173,6 +174,10 @@ def regions_with_offering(cls, instance_type: str, regions = [r for r in regions if r.zones] return regions + @classmethod + def optimize_by_zone(cls) -> bool: + return aws_utils.use_reservations() + @classmethod def zones_provision_loop( cls, @@ -197,11 +202,13 @@ def zones_provision_loop( zone=None) for r in regions: assert r.zones is not None, r - if num_nodes > 1: + if num_nodes > 1 or aws_utils.use_reservations(): # When num_nodes > 1, we shouldn't pass a list of zones to the # AWS NodeProvider to try, because it may then place the nodes of # the same cluster in different zones. This is an artifact of the # current AWS NodeProvider implementation. + # Also, when using reservations, they are zone-specific, so we + # should return one zone at a time. for z in r.zones: yield [z] else: @@ -856,6 +863,37 @@ def check_quota_available(cls, # Quota found to be greater than zero, try provisioning return True + def get_reservations_available_resources( + self, + instance_type: str, + region: str, + zone: Optional[str], + specific_reservations: Set[str], + ) -> Dict[str, int]: + if zone is None: + # For backward compatibility, the cluster in INIT state launched + # before #2352 may not have zone information. In this case, we + # return 0 for all reservations. + return {reservation: 0 for reservation in specific_reservations} + reservations = aws_utils.list_reservations_for_instance_type( + instance_type, region) + + filtered_reservations = [] + for r in reservations: + if zone != r.zone: + continue + if r.targeted: + if r.name in specific_reservations: + filtered_reservations.append(r) + else: + filtered_reservations.append(r) + reservation_available_resources = { + r.name: r.available_resources for r in filtered_reservations + } + logger.debug('Get AWS reservations available resources:' + f'{region}-{zone}: {reservation_available_resources}') + return reservation_available_resources + @classmethod def query_status(cls, name: str, tag_filters: Dict[str, str], region: Optional[str], zone: Optional[str], diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 854cb467c5f..9775109ac80 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -177,6 +177,11 @@ def regions_with_offering(cls, instance_type: str, """ raise NotImplementedError + @classmethod + def optimize_by_zone(cls) -> bool: + """Returns whether to optimize this cloud by zone (default: region).""" + return False + @classmethod def zones_provision_loop( cls, diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index e24e67b2486..643d55d7037 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -260,6 +260,10 @@ def regions_with_offering(cls, instance_type: str, regions = [r for r in regions if r.zones] return regions + @classmethod + def optimize_by_zone(cls) -> bool: + return True + @classmethod def zones_provision_loop( cls, diff --git a/sky/clouds/utils/aws_utils.py b/sky/clouds/utils/aws_utils.py new file mode 100644 index 00000000000..a6c21a15d5c --- /dev/null +++ b/sky/clouds/utils/aws_utils.py @@ -0,0 +1,57 @@ +"""Utilities for AWS.""" +import dataclasses +import time +from typing import List + +import cachetools + +from sky import skypilot_config +from sky.adaptors import aws + + +@dataclasses.dataclass +class AWSReservation: + name: str + instance_type: str + zone: str + available_resources: int + # Whether the reservation is targeted, i.e. can only be consumed when + # the reservation name is specified. + targeted: bool + + +def use_reservations() -> bool: + prioritize_reservations = skypilot_config.get_nested( + ('aws', 'prioritize_reservations'), False) + specific_reservations = skypilot_config.get_nested( + ('aws', 'specific_reservations'), set()) + return prioritize_reservations or specific_reservations + + +@cachetools.cached(cache=cachetools.TTLCache(maxsize=100, + ttl=300, + timer=time.time)) +def list_reservations_for_instance_type( + instance_type: str, + region: str, +) -> List[AWSReservation]: + if not use_reservations(): + return [] + ec2 = aws.client('ec2', region_name=region) + response = ec2.describe_capacity_reservations(Filters=[{ + 'Name': 'instance-type', + 'Values': [instance_type] + }, { + 'Name': 'state', + 'Values': ['active'] + }]) + reservations = response['CapacityReservations'] + return [ + AWSReservation( + name=r['CapacityReservationId'], + instance_type=r['InstanceType'], + zone=r['AvailabilityZone'], + available_resources=r['AvailableInstanceCount'], + targeted=r['InstanceMatchCriteria'] == 'targeted', + ) for r in reservations + ] diff --git a/sky/optimizer.py b/sky/optimizer.py index 7b4b29e3bce..10aa697258b 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -19,6 +19,8 @@ from sky.adaptors import common as adaptors_common from sky.utils import env_options from sky.utils import log_utils +from sky.utils import rich_utils +from sky.utils import subprocess_utils from sky.utils import ux_utils if typing.TYPE_CHECKING: @@ -252,6 +254,26 @@ def _estimate_nodes_cost_or_time( # node -> cloud -> list of resources that satisfy user's requirements. node_to_candidate_map: _TaskToPerCloudCandidates = {} + def get_available_reservations( + launchable_resources: Dict[resources_lib.Resources, + List[resources_lib.Resources]] + ) -> Dict[resources_lib.Resources, int]: + num_available_reserved_nodes_per_resource = {} + + def get_reservations_available_resources( + resources: resources_lib.Resources): + num_available_reserved_nodes_per_resource[resources] = sum( + resources.get_reservations_available_resources().values()) + + launchable_resources_list: List[resources_lib.Resources] = sum( + launchable_resources.values(), []) + with rich_utils.safe_status( + '[cyan]Checking reserved resources...[/]'): + subprocess_utils.run_in_parallel( + get_reservations_available_resources, + launchable_resources_list) + return num_available_reserved_nodes_per_resource + # Compute the estimated cost/time for each node. for node_i, node in enumerate(topo_order): if node_i == 0: @@ -279,7 +301,11 @@ def _estimate_nodes_cost_or_time( list(node.resources)[0]: list(node.resources) } + # Fetch reservations in advance and in parallel to speed up the + # reservation info fetching. num_resources = len(list(node.resources)) + num_available_reserved_nodes_per_resource = ( + get_available_reservations(launchable_resources)) for orig_resources, launchable_list in launchable_resources.items(): if num_resources == 1 and node.time_estimator_func is None: @@ -302,15 +328,16 @@ def _estimate_nodes_cost_or_time( else: estimated_runtime = node.estimate_runtime( orig_resources) + for resources in launchable_list: if do_print: logger.debug(f'resources: {resources}') if minimize_cost: cost_per_node = resources.get_cost(estimated_runtime) - num_available_reserved_nodes = sum( - resources.get_reservations_available_resources( - ).values()) + num_available_reserved_nodes = ( + num_available_reserved_nodes_per_resource[resources] + ) # We consider the cost of the unused reservation # resources to be 0 since we are already paying for @@ -1116,7 +1143,7 @@ def _make_launchables_for_valid_region_zones( regions = launchable_resources.get_valid_regions_for_launchable() for region in regions: if (launchable_resources.use_spot and region.zones is not None or - isinstance(launchable_resources.cloud, clouds.GCP)): + launchable_resources.cloud.optimize_by_zone()): # Spot instances. # Do not batch the per-zone requests. for zone in region.zones: diff --git a/sky/provision/aws/instance.py b/sky/provision/aws/instance.py index 0161992bffc..24173482f34 100644 --- a/sky/provision/aws/instance.py +++ b/sky/provision/aws/instance.py @@ -15,6 +15,7 @@ from sky import status_lib from sky.adaptors import aws from sky.clouds import aws as aws_cloud +from sky.clouds.utils import aws_utils from sky.provision import common from sky.provision import constants from sky.provision.aws import utils @@ -429,19 +430,81 @@ def _create_node_tag(target_instance, is_head: bool = True) -> str: head_instance_id = _create_node_tag(resumed_instances[0]) if to_start_count > 0: + target_reservations = (config.node_config.get( + 'CapacityReservationSpecification', + {}).get('CapacityReservationTarget', + {}).get('CapacityReservationId', [])) + created_instances = [] + if target_reservations: + node_config = copy.deepcopy(config.node_config) + # Clear the capacity reservation specification settings in the + # original node config, as we will create instances with + # reservations with specific settings for each reservation. + node_config['CapacityReservationSpecification'] = { + 'CapacityReservationTarget': {} + } + + reservations = aws_utils.list_reservations_for_instance_type( + node_config['InstanceType'], region=region) + # Filter the reservations by the user-specified ones, because + # reservations contain 'open' reservations as well, which do not + # need to explicitly specify in the config for creating instances. + target_reservations_to_count = {} + for reservation in reservations: + if (reservation.targeted and + reservation.name in target_reservations): + target_reservations_to_count[ + reservation.name] = reservation.available_resources + + target_reservations_list = sorted( + target_reservations_to_count.items(), + key=lambda x: x[1], + reverse=True) + for reservation, reservation_count in target_reservations_list: + if reservation_count <= 0: + # We have sorted the reservations by the available + # resources, so if the reservation is not available, the + # following reservations are not available either. + break + reservation_count = min(reservation_count, to_start_count) + logger.debug(f'Creating {reservation_count} instances ' + f'with reservation {reservation}') + node_config['CapacityReservationSpecification'][ + 'CapacityReservationTarget'] = { + 'CapacityReservationId': reservation + } + created_reserved_instances = _create_instances( + ec2_fail_fast, + cluster_name_on_cloud, + node_config, + tags, + reservation_count, + associate_public_ip_address=( + not config.provider_config['use_internal_ips'])) + created_instances.extend(created_reserved_instances) + to_start_count -= reservation_count + if to_start_count <= 0: + break + # TODO(suquark): If there are existing instances (already running or # resumed), then we cannot guarantee that they will be in the same # availability zone (when there are multiple zones specified). # This is a known issue before. - created_instances = _create_instances( - ec2_fail_fast, - cluster_name_on_cloud, - config.node_config, - tags, - to_start_count, - associate_public_ip_address=( - not config.provider_config['use_internal_ips'])) + if to_start_count > 0: + # Remove the capacity reservation specification from the node config + # as we have already created the instances with the reservations. + config.node_config.get('CapacityReservationSpecification', + {}).pop('CapacityReservationTarget', None) + created_remaining_instances = _create_instances( + ec2_fail_fast, + cluster_name_on_cloud, + config.node_config, + tags, + to_start_count, + associate_public_ip_address=( + not config.provider_config['use_internal_ips'])) + created_instances.extend(created_remaining_instances) created_instances.sort(key=lambda x: x.id) created_instance_ids = [n.id for n in created_instances] diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 4cfecfe2b12..7e9dfccdaf1 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -84,6 +84,12 @@ available_node_types: # SpotOptions: # MaxPrice: MAX_HOURLY_PRICE {% endif %} + CapacityReservationSpecification: + CapacityReservationPreference: open + {% if specific_reservations %} + CapacityReservationTarget: + CapacityReservationId: {{specific_reservations}} + {% endif %} # Use cloud init in UserData to set up the authorized_keys to get # around the number of keys limit and permission issues with # ec2.describe_key_pairs. diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 0fa1e8d34ce..01dc14f617c 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -706,6 +706,15 @@ def get_config_schema(): 'required': [], 'additionalProperties': False, 'properties': { + 'prioritize_reservations': { + 'type': 'boolean', + }, + 'specific_reservations': { + 'type': 'array', + 'items': { + 'type': 'string', + }, + }, 'disk_encrypted': { 'type': 'boolean', },