From 7dfcba101e7bd9f21b6bd1f3ff78234d1387e375 Mon Sep 17 00:00:00 2001 From: Sheth Date: Fri, 16 Feb 2024 23:04:10 -0800 Subject: [PATCH 01/42] initial commit --- examples/unsloth/train.py | 61 +++++++++++++++++++++++++++++++++++ examples/unsloth/unsloth.yaml | 14 ++++++++ 2 files changed, 75 insertions(+) create mode 100644 examples/unsloth/train.py create mode 100644 examples/unsloth/unsloth.yaml diff --git a/examples/unsloth/train.py b/examples/unsloth/train.py new file mode 100644 index 00000000000..530980ba392 --- /dev/null +++ b/examples/unsloth/train.py @@ -0,0 +1,61 @@ +from unsloth import FastLanguageModel +import torch +from trl import SFTTrainer +from transformers import TrainingArguments +from datasets import load_dataset +max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any! +# Get LAION dataset +url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl" +dataset = load_dataset("json", data_files = {"train" : url}, split = "train") + +# 4bit pre quantized models we support - 4x faster downloading! +fourbit_models = [ + "unsloth/mistral-7b-bnb-4bit", + "unsloth/llama-2-7b-bnb-4bit", + "unsloth/llama-2-13b-bnb-4bit", + "unsloth/codellama-34b-bnb-4bit", + "unsloth/tinyllama-bnb-4bit", +] +# Load Llama model +model, tokenizer = FastLanguageModel.from_pretrained( + model_name = "unsloth/mistral-7b-bnb-4bit", # Supports Llama, Mistral - replace this! + max_seq_length = max_seq_length, + dtype = None, + load_in_4bit = True, +) + +# Do model patching and add fast LoRA weights +model = FastLanguageModel.get_peft_model( + model, + r = 16, + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj",], + lora_alpha = 16, + lora_dropout = 0, # Supports any, but = 0 is optimized + bias = "none", # Supports any, but = "none" is optimized + use_gradient_checkpointing = True, + random_state = 3407, + max_seq_length = max_seq_length, +) + +trainer = SFTTrainer( + model = model, + train_dataset = dataset, + dataset_text_field = "text", + max_seq_length = max_seq_length, + tokenizer = tokenizer, + args = TrainingArguments( + per_device_train_batch_size = 2, + gradient_accumulation_steps = 4, + warmup_steps = 10, + max_steps = 60, + fp16 = not torch.cuda.is_bf16_supported(), + bf16 = torch.cuda.is_bf16_supported(), + logging_steps = 1, + output_dir = "outputs", + optim = "adamw_8bit", + seed = 3407, + ), +) +trainer.train() + diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml new file mode 100644 index 00000000000..9f66955fa6f --- /dev/null +++ b/examples/unsloth/unsloth.yaml @@ -0,0 +1,14 @@ +resources: + accelerators: T4:1 + disk_size: 128 + +workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth + +setup: | + set -ex + pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton --index-url https://download.pytorch.org/whl/cu118 + pip install ipython + pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" + +run: | + python train.py \ No newline at end of file From a08c0344f641eadbd7b91634f2b4575e972de405 Mon Sep 17 00:00:00 2001 From: Sheth Date: Fri, 16 Feb 2024 23:05:58 -0800 Subject: [PATCH 02/42] newline --- examples/unsloth/unsloth.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 9f66955fa6f..825e0dd8aa1 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -11,4 +11,5 @@ setup: | pip install "unsloth[cu118_torch220] @ 
git+https://github.com/unslothai/unsloth.git" run: | - python train.py \ No newline at end of file + python train.py + \ No newline at end of file From b85cbf92c10d51249b4e5f0697ee4e468fab0769 Mon Sep 17 00:00:00 2001 From: Sheth Date: Fri, 16 Feb 2024 23:17:10 -0800 Subject: [PATCH 03/42] comments --- examples/unsloth/{train.py => unsloth.py} | 20 ++++++++++++-------- examples/unsloth/unsloth.yaml | 16 +++++++++++++--- 2 files changed, 25 insertions(+), 11 deletions(-) rename examples/unsloth/{train.py => unsloth.py} (77%) diff --git a/examples/unsloth/train.py b/examples/unsloth/unsloth.py similarity index 77% rename from examples/unsloth/train.py rename to examples/unsloth/unsloth.py index 530980ba392..8d5f8071c68 100644 --- a/examples/unsloth/train.py +++ b/examples/unsloth/unsloth.py @@ -1,14 +1,16 @@ +# Use the unsloth library to fine-tune a Mistral model + from unsloth import FastLanguageModel import torch from trl import SFTTrainer from transformers import TrainingArguments from datasets import load_dataset -max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any! -# Get LAION dataset +max_seq_length = 2048 + +# [1] Get LAION dataset url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl" dataset = load_dataset("json", data_files = {"train" : url}, split = "train") -# 4bit pre quantized models we support - 4x faster downloading! fourbit_models = [ "unsloth/mistral-7b-bnb-4bit", "unsloth/llama-2-7b-bnb-4bit", @@ -16,28 +18,30 @@ "unsloth/codellama-34b-bnb-4bit", "unsloth/tinyllama-bnb-4bit", ] -# Load Llama model + +# [2] Load Mistral model model, tokenizer = FastLanguageModel.from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", # Supports Llama, Mistral - replace this! + model_name = "unsloth/mistral-7b-bnb-4bit", max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, ) -# Do model patching and add fast LoRA weights +# [3] Do model patching and add fast LoRA weights model = FastLanguageModel.get_peft_model( model, r = 16, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",], lora_alpha = 16, - lora_dropout = 0, # Supports any, but = 0 is optimized - bias = "none", # Supports any, but = "none" is optimized + lora_dropout = 0, + bias = "none", use_gradient_checkpointing = True, random_state = 3407, max_seq_length = max_seq_length, ) +# [4] Initialize and train the model using the SFTTrainer trainer = SFTTrainer( model = model, train_dataset = dataset, diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 825e0dd8aa1..44ad3963ecb 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -1,8 +1,19 @@ +# Runs the unsloth example app via SkyPilot +# +# The example app starts by obtaining the LAION dataset, +# loads the Mistral model with 4-bit precision, performs model +# patching with fast LoRA weights, and finally initializes and +# trains the model using the SFTTrainer with specified hyperparameters +# and the LAION dataset. +# +# Usage: +# sky launch -c myclus unsloth.yaml + resources: accelerators: T4:1 disk_size: 128 -workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth +workdir: . 
setup: | set -ex @@ -11,5 +22,4 @@ setup: | pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" run: | - python train.py - \ No newline at end of file + python unsloth.py From 4fbfe1713f78e9acb0a8f083c09ffa18855bb14b Mon Sep 17 00:00:00 2001 From: Sheth Date: Fri, 16 Feb 2024 23:26:34 -0800 Subject: [PATCH 04/42] run linter --- examples/unsloth/unsloth.py | 7 ++++--- examples/unsloth/unsloth.yaml | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/unsloth/unsloth.py b/examples/unsloth/unsloth.py index 8d5f8071c68..ee9ab2b2f05 100644 --- a/examples/unsloth/unsloth.py +++ b/examples/unsloth/unsloth.py @@ -1,10 +1,11 @@ # Use the unsloth library to fine-tune a Mistral model -from unsloth import FastLanguageModel +from datasets import load_dataset import torch -from trl import SFTTrainer from transformers import TrainingArguments -from datasets import load_dataset +from trl import SFTTrainer +from unsloth import FastLanguageModel + max_seq_length = 2048 # [1] Get LAION dataset diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 44ad3963ecb..b900f0d0752 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -3,8 +3,8 @@ # The example app starts by obtaining the LAION dataset, # loads the Mistral model with 4-bit precision, performs model # patching with fast LoRA weights, and finally initializes and -# trains the model using the SFTTrainer with specified hyperparameters -# and the LAION dataset. +# trains the model using the SFTTrainer with specified +# hyperparameters and the LAION dataset. # # Usage: # sky launch -c myclus unsloth.yaml @@ -17,7 +17,8 @@ workdir: . setup: | set -ex - pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton --index-url https://download.pytorch.org/whl/cu118 + pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ + --index-url https://download.pytorch.org/whl/cu118 pip install ipython pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" From 6fc77e12809bf67cf6d60e77bd694fa6e1f0581f Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 18 Feb 2024 00:47:45 -0800 Subject: [PATCH 05/42] reminder for down --- examples/unsloth/unsloth.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index b900f0d0752..7fac6770df2 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -8,6 +8,7 @@ # # Usage: # sky launch -c myclus unsloth.yaml +# sky down myclus resources: accelerators: T4:1 From d6cb99316b4ce23fc09915e378105d8148888581 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 18 Feb 2024 02:42:30 -0800 Subject: [PATCH 06/42] tentatively done with example --- examples/unsloth/unsloth.yaml | 6 +++++- examples/unsloth/{unsloth.py => unsloth_example.py} | 11 +++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) rename examples/unsloth/{unsloth.py => unsloth_example.py} (84%) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 7fac6770df2..4c01a5ca220 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -14,6 +14,10 @@ resources: accelerators: T4:1 disk_size: 128 +file_mounts: + /outputs: + name: my-unsloth-checkpoints + workdir: . 
setup: | @@ -24,4 +28,4 @@ setup: | pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" run: | - python unsloth.py + python unsloth_example.py --output-dir /outputs diff --git a/examples/unsloth/unsloth.py b/examples/unsloth/unsloth_example.py similarity index 84% rename from examples/unsloth/unsloth.py rename to examples/unsloth/unsloth_example.py index ee9ab2b2f05..95ead037859 100644 --- a/examples/unsloth/unsloth.py +++ b/examples/unsloth/unsloth_example.py @@ -1,5 +1,6 @@ # Use the unsloth library to fine-tune a Mistral model +import argparse from datasets import load_dataset import torch from transformers import TrainingArguments @@ -42,7 +43,12 @@ max_seq_length = max_seq_length, ) -# [4] Initialize and train the model using the SFTTrainer +# [4] Parse output directory of checkpoints +parser = argparse.ArgumentParser() +parser.add_argument("--output-dir", type=str, default="/outputs") +args = parser.parse_args() + +# [5] Initialize and train the model using the SFTTrainer trainer = SFTTrainer( model = model, train_dataset = dataset, @@ -57,9 +63,10 @@ fp16 = not torch.cuda.is_bf16_supported(), bf16 = torch.cuda.is_bf16_supported(), logging_steps = 1, - output_dir = "outputs", + output_dir = args.output_dir[1:], optim = "adamw_8bit", seed = 3407, + save_steps = 10, ), ) trainer.train() From 2d5aceb894a2cf7223f5443444699581bfe4aad0 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 18 Feb 2024 02:53:11 -0800 Subject: [PATCH 07/42] formatting --- examples/unsloth/unsloth_example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 95ead037859..52c0a306c67 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -1,6 +1,7 @@ # Use the unsloth library to fine-tune a Mistral model import argparse + from datasets import load_dataset import torch from transformers import TrainingArguments From 4e1954a0027dbdea3375d3afd73177d36a18f2eb Mon Sep 17 00:00:00 2001 From: Sheth Date: Mon, 19 Feb 2024 01:11:30 -0800 Subject: [PATCH 08/42] yapf --- examples/unsloth/unsloth_example.py | 70 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 52c0a306c67..400f2c8402b 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -12,7 +12,7 @@ # [1] Get LAION dataset url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl" -dataset = load_dataset("json", data_files = {"train" : url}, split = "train") +dataset = load_dataset("json", data_files={"train": url}, split="train") fourbit_models = [ "unsloth/mistral-7b-bnb-4bit", @@ -24,24 +24,31 @@ # [2] Load Mistral model model, tokenizer = FastLanguageModel.from_pretrained( - model_name = "unsloth/mistral-7b-bnb-4bit", - max_seq_length = max_seq_length, - dtype = None, - load_in_4bit = True, + model_name="unsloth/mistral-7b-bnb-4bit", + max_seq_length=max_seq_length, + dtype=None, + load_in_4bit=True, ) # [3] Do model patching and add fast LoRA weights model = FastLanguageModel.get_peft_model( model, - r = 16, - target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj",], - lora_alpha = 16, - lora_dropout = 0, - bias = "none", - use_gradient_checkpointing = True, - random_state = 3407, - max_seq_length = max_seq_length, + r=16, + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + 
"gate_proj", + "up_proj", + "down_proj", + ], + lora_alpha=16, + lora_dropout=0, + bias="none", + use_gradient_checkpointing=True, + random_state=3407, + max_seq_length=max_seq_length, ) # [4] Parse output directory of checkpoints @@ -51,24 +58,23 @@ # [5] Initialize and train the model using the SFTTrainer trainer = SFTTrainer( - model = model, - train_dataset = dataset, - dataset_text_field = "text", - max_seq_length = max_seq_length, - tokenizer = tokenizer, - args = TrainingArguments( - per_device_train_batch_size = 2, - gradient_accumulation_steps = 4, - warmup_steps = 10, - max_steps = 60, - fp16 = not torch.cuda.is_bf16_supported(), - bf16 = torch.cuda.is_bf16_supported(), - logging_steps = 1, - output_dir = args.output_dir[1:], - optim = "adamw_8bit", - seed = 3407, - save_steps = 10, + model=model, + train_dataset=dataset, + dataset_text_field="text", + max_seq_length=max_seq_length, + tokenizer=tokenizer, + args=TrainingArguments( + per_device_train_batch_size=2, + gradient_accumulation_steps=4, + warmup_steps=10, + max_steps=60, + fp16=not torch.cuda.is_bf16_supported(), + bf16=torch.cuda.is_bf16_supported(), + logging_steps=1, + output_dir=args.output_dir[1:], + optim="adamw_8bit", + seed=3407, + save_steps=10, ), ) trainer.train() - From 27a89050cf929f4f9a54d3078c44e2e229a3f14a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 22 Feb 2024 14:55:15 -0800 Subject: [PATCH 09/42] [Storage] Storage mounting tool permissions fix (#3215) * fix permissions * fix permissions --- sky/data/mounting_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 32904ada517..2f4e37a1b66 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -19,7 +19,7 @@ def get_s3_mount_install_cmd() -> str: install_cmd = ('sudo wget -nc https://github.com/romilbhardwaj/goofys/' 'releases/download/0.24.0-romilb-upstream/goofys ' '-O /usr/local/bin/goofys && ' - 'sudo chmod +x /usr/local/bin/goofys') + 'sudo chmod 755 /usr/local/bin/goofys') return install_cmd From 41a63df344d3b3cea0ae837d7391f3c1e86bb5da Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 22 Feb 2024 17:43:06 -0800 Subject: [PATCH 10/42] [LLM] Example for Serving Gemma (#3207) * Add serve for gemma and fix mixtral dependency * Add hf token * fix model len * Add comment * Serve your private gemma * fix serve yaml * readme * Remove chat completion due to the wrong template * add readme * Update llm/gemma/README.md Co-authored-by: Zongheng Yang * address comments * Update README.md Co-authored-by: Zongheng Yang * Update llm/gemma/README.md Co-authored-by: Zongheng Yang * Update llm/gemma/README.md Co-authored-by: Zongheng Yang * Update llm/gemma/README.md Co-authored-by: Zongheng Yang * Change to it * Add chat API * use HF_TOKEN env * typo --------- Co-authored-by: Zongheng Yang --- README.md | 2 + docs/source/index.rst | 1 + llm/gemma/README.md | 103 ++++++++++++++++++++++++++++++++++++++++++ llm/gemma/serve.yaml | 47 +++++++++++++++++++ 4 files changed, 153 insertions(+) create mode 100644 llm/gemma/README.md create mode 100644 llm/gemma/serve.yaml diff --git a/README.md b/README.md index 2c03d5afa06..606ac06e2f0 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ ---- :fire: *News* :fire: +- [Feb, 2024] Deploying and scaling [**Gemma**](https://blog.google/technology/developers/gemma-open-models/) with SkyServe: [**example**](./llm/gemma/) - [Feb, 2024] Speed up your LLM deployments with 
[**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/) - [Feb, 2024] Serving [**Code Llama 70B**](https://ai.meta.com/blog/code-llama-large-language-model-coding/) with vLLM and SkyServe: [**example**](./llm/codellama/) - [Dec, 2023] Using [**LoRAX**](https://github.com/predibase/lorax) to serve 1000s of finetuned LLMs on a single instance in the cloud: [**example**](./llm/lorax/) @@ -148,6 +149,7 @@ To learn more, see our [Documentation](https://skypilot.readthedocs.io/en/latest Runnable examples: - LLMs on SkyPilot + - [Gemma](./llm/gemma/) - [Mixtral 8x7B](./llm/mixtral/); [Mistral 7B](https://docs.mistral.ai/self-deployment/skypilot/) (from official Mistral team) - [Code Llama](./llm/codellama/) - [vLLM: Serving LLM 24x Faster On the Cloud](./llm/vllm/) (from official vLLM team) diff --git a/docs/source/index.rst b/docs/source/index.rst index 493c7459a9a..fbf03b3f552 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -69,6 +69,7 @@ Runnable examples: * **LLMs on SkyPilot** + * `Gemma `_ * `Mixtral 8x7B `_; `Mistral 7B `_ (from official Mistral team) * `Code Llama `_ * `vLLM: Serving LLM 24x Faster On the Cloud `_ (from official vLLM team) diff --git a/llm/gemma/README.md b/llm/gemma/README.md new file mode 100644 index 00000000000..d0ff0114ff8 --- /dev/null +++ b/llm/gemma/README.md @@ -0,0 +1,103 @@ +# Serve Your Gemma on Any Cloud + +Google released [Gemma](https://blog.google/technology/developers/gemma-open-models/) and has made a big wave in the AI community. +It opens the opportunity for the open-source community to serve and finetune private Gemini. + +## Serve Gemma on any Cloud + +Serving Gemma on any cloud is easy with SkyPilot. With [serve.yaml](serve.yaml) in this directory, you host the model on any cloud with a single command. + +### Prerequsites + +1. Apply for access to the Gemma model + +Go to the [application page](https://huggingface.co/google/gemma-7b) and click **Acknowledge license** to apply for access to the model weights. + + +2. Get the access token from huggingface + +Generate a read-only access token on huggingface [here](https://huggingface.co/settings/token), and make sure your huggingface account can access the Gemma models [here](https://huggingface.co/google/gemma-7b). + +3. Install SkyPilot + +```bash +pip install "skypilot-nightly[all]" +``` +For detailed installation instructions, please refer to the [installation guide](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). + +### Host on a Single Instance + +We can host the model with a single instance: +```bash +HF_TOKEN="xxx" sky launch -c gemma serve.yaml --env HF_TOKEN +``` + +After the cluster is launched, we can access the model with the following command: +```bash +IP=$(sky status --ip gemma) + +curl -L http://$IP:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "google/gemma-7b-it", + "prompt": "My favourite condiment is", + "max_tokens": 25 + }' | jq . +``` + +Chat API is also supported: +```bash +IP=$(sky status --ip gemma) + +curl -L http://$IP:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "google/gemma-7b-it", + "messages": [ + { + "role": "user", + "content": "Hello! What is your name?" 
+ } + ], + "max_tokens": 25 + }' +``` + +### Scale the Serving with SkyServe + + +Using the same YAML, we can easily scale the model serving across multiple instances, regions and clouds with SkyServe: +```bash +HF_TOKEN="xxx" sky serve up -n gemma serve.yaml --env HF_TOKEN +``` + +> Notice the only change is from `sky launch` to `sky serve up`. The same YAML can be used without changes. + +After the cluster is launched, we can access the model with the following command: +```bash +ENDPOINT=$(sky serve status --endpoint gemma) + +curl -L http://$ENDPOINT/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "google/gemma-7b-it", + "prompt": "My favourite condiment is", + "max_tokens": 25 + }' | jq . +``` + +Chat API is also supported: +```bash +curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "google/gemma-7b-it", + "messages": [ + { + "role": "user", + "content": "Hello! What is your name?" + } + ], + "max_tokens": 25 + }' +``` diff --git a/llm/gemma/serve.yaml b/llm/gemma/serve.yaml new file mode 100644 index 00000000000..a477554d47a --- /dev/null +++ b/llm/gemma/serve.yaml @@ -0,0 +1,47 @@ +# A example yaml for serving Gemma model from Mistral.ai with an OpenAI API. +# Usage: +# 1. Launch on a single instance: `sky launch -c gemma ./serve.yaml` +# 2. Scale up to multiple instances with a single endpoint: +# `sky serve up -n gemma ./serve.yaml` +service: + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + initial_delay_seconds: 1200 + replicas: 2 + +envs: + MODEL_NAME: google/gemma-7b-it + HF_TOKEN: # TODO: Replace with huggingface token + +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} + ports: 8000 + disk_tier: best + +setup: | + conda activate gemma + if [ $? -ne 0 ]; then + conda create -n gemma -y python=3.10 + conda activate gemma + fi + pip install vllm==0.3.2 + pip install transformers==4.38.0 + python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')" + +run: | + conda activate gemma + export PATH=$PATH:/sbin + # --max-model-len is set to 1024 to avoid taking too much GPU memory on L4 and + # A10g with small memory. + python -u -m vllm.entrypoints.openai.api_server \ + --host 0.0.0.0 \ + --model $MODEL_NAME \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + --max-model-len 1024 | tee ~/openai_api_server.log + From 2b17e91d93f9d54e9a0b7f44e837dba9d0c1f837 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 23 Feb 2024 13:18:59 -0800 Subject: [PATCH 11/42] [LLM] Add logo for Gemma (#3220) --- llm/gemma/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/gemma/README.md b/llm/gemma/README.md index d0ff0114ff8..676afce6606 100644 --- a/llm/gemma/README.md +++ b/llm/gemma/README.md @@ -1,4 +1,5 @@ # Serve Your Gemma on Any Cloud +![image](https://github.com/skypilot-org/skypilot/assets/6753189/e452c39e-b5ef-4cb2-ab48-053f9e6f67b7) Google released [Gemma](https://blog.google/technology/developers/gemma-open-models/) and has made a big wave in the AI community. It opens the opportunity for the open-source community to serve and finetune private Gemini. 
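The Gemma patches above expose an OpenAI-compatible API served by vLLM, so the endpoint can also be queried programmatically rather than with curl. A minimal client sketch (not part of the patch): the address is a placeholder — substitute the output of `sky serve status --endpoint gemma`, or `$(sky status --ip gemma):8000` for the single-instance launch; the model name comes from `serve.yaml`.

```python
import requests

# Hypothetical address; replace with your own endpoint.
ENDPOINT = '1.2.3.4:30001'

resp = requests.post(
    f'http://{ENDPOINT}/v1/chat/completions',
    json={
        'model': 'google/gemma-7b-it',
        'messages': [{
            'role': 'user',
            'content': 'Hello! What is your name?'
        }],
        'max_tokens': 25,
    },
    timeout=60)
resp.raise_for_status()
# OpenAI-compatible servers return the reply under choices[0].message.content.
print(resp.json()['choices'][0]['message']['content'])
```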
From b326d12610acbdae15e155ca12c0c6e2ef800004 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 23 Feb 2024 13:22:42 -0800 Subject: [PATCH 12/42] Minor fixes for release 0.5.0 (#3212) * when removing cudo credential, sky check fails * remove tips * minor hint fix * fix cluster version for k8s * fix typo --- docs/source/examples/auto-failover.rst | 6 ------ sky/clouds/cudo.py | 9 ++++++++- tests/backward_compatibility_tests.sh | 2 +- tests/kubernetes/README.md | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/source/examples/auto-failover.rst b/docs/source/examples/auto-failover.rst index c8f77c533a1..bbfc3cb469b 100644 --- a/docs/source/examples/auto-failover.rst +++ b/docs/source/examples/auto-failover.rst @@ -108,12 +108,6 @@ AWS, where it succeeded after two regions: Multiple Candidate GPUs ------------------------- -.. tip:: - - Support for multiple resources via ``any_of`` or ``ordered`` was added after v0.4.1. - - To use this feature, :ref:`install the nightly release `: ``pip install -U skypilot-nightly`` - If a task can be run on different GPUs, the user can specify multiple candidate GPUs, and SkyPilot will automatically find the cheapest available GPU. diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 855bdaf59ae..ad7a22e6e03 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -276,7 +276,14 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: # pylint: disable=import-outside-toplevel,unused-import from cudo_compute import cudo_api from cudo_compute.rest import ApiException - _, error = cudo_api.client() + try: + _, error = cudo_api.client() + except FileNotFoundError as e: + return False, ( + 'Cudo credentials are not set. ' + f'{cls._CREDENTIAL_HINT}\n' + f'{cls._INDENT_PREFIX}' + f'{common_utils.format_exception(e, use_bracket=True)}') if error is not None: return False, ( diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index 9fd7586e22b..47381294afe 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -112,7 +112,7 @@ sky logs ${CLUSTER_NAME}-4 2 fi # (1 node) sky start + sky exec + sky queue + sky logs -if [ "$start_form" -le 5 ]; then +if [ "$start_from" -le 5 ]; then conda activate sky-back-compat-master rm -r ~/.sky/wheels || true sky launch --cloud ${CLOUD} -y --cpus 2 -c ${CLUSTER_NAME}-5 examples/minimal.yaml diff --git a/tests/kubernetes/README.md b/tests/kubernetes/README.md index 220f96f6a9e..4a882352703 100644 --- a/tests/kubernetes/README.md +++ b/tests/kubernetes/README.md @@ -32,7 +32,7 @@ sky local up ```bash PROJECT_ID=$(gcloud config get-value project) CLUSTER_NAME=testclusterromil - gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.27.3-gke.100" --release-channel "regular" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork 
"projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "v100" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" + gcloud beta container --project "${PROJECT_ID}" clusters create "${CLUSTER_NAME}" --zone "us-central1-c" --no-enable-basic-auth --cluster-version "1.29.0-gke.1381000" --release-channel "regular" --machine-type "n1-standard-8" --accelerator "type=nvidia-tesla-t4,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/${PROJECT_ID}/global/networks/default" --subnetwork "projects/${PROJECT_ID}/regions/us-central1/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "110" --security-posture=standard --workload-vulnerability-scanning=disabled --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "v100" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-8" --accelerator 
"type=nvidia-tesla-v100,count=1" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" && gcloud beta container --project "${PROJECT_ID}" node-pools create "largecpu" --cluster "${CLUSTER_NAME}" --zone "us-central1-c" --machine-type "n1-standard-16" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" --num-nodes "2" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations "us-central1-c" ``` 2. Get the kubeconfig for your cluster and place it in `~/.kube/config`: ```bash From 6d778726c0ef70da6e9cb614be774006f4fcd075 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 23 Feb 2024 19:18:39 -0800 Subject: [PATCH 13/42] [Docker] Add retry for docker pull due to daemon not ready (#3218) * Add retry for docker pull due to daemon not ready * longer wait time * longer wait time * retry earlier * add retry for retries as well * longer wait time * change wait time * format * Add comment * Fix * Fix indent for azure docker config * Fix docker login config * Fix comments * More robust docker login config * Add retry for docker check * minor fix * Add additional test for stop and start with docker * Fix cancelled --- sky/backends/backend_utils.py | 2 +- sky/backends/cloud_vm_ray_backend.py | 2 + sky/provision/docker_utils.py | 58 ++++++++++++++------ sky/skylet/providers/command_runner.py | 74 ++++++++++++++++++++------ sky/templates/aws-ray.yml.j2 | 9 ---- sky/templates/azure-ray.yml.j2 | 38 ++++++------- sky/templates/gcp-ray.yml.j2 | 18 +++---- tests/test_smoke.py | 8 +++ 8 files changed, 139 insertions(+), 70 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index ad98e2b4e0e..596d0bec043 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -136,7 +136,7 @@ # should take the latest security group name. _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [ ('provider', 'availability_zone'), - # AWS with new provisioner has docker_login_config in the + # Clouds with new provisioner has docker_login_config in the # docker field, instead of the provider field. ('docker', 'docker_login_config'), # Other clouds diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index fc5d3d34b56..12f8bd8ac28 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1837,6 +1837,8 @@ def need_ray_up( logger.info( 'Retrying launching in {:.1f} seconds.'.format(sleep)) time.sleep(sleep) + # TODO(zhwu): when we retry ray up, it is possible that the ray + # cluster fail to start because --no-restart flag is used. 
ray_up_return_value = ray_up() assert ray_up_return_value is not None diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index 2df14ce39fd..303032128e3 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -2,6 +2,7 @@ import dataclasses import shlex +import time import typing from typing import Any, Dict, List @@ -14,6 +15,9 @@ logger = sky_logging.init_logger(__name__) +DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ' + 'the Docker daemon socket') + @dataclasses.dataclass class DockerLoginConfig: @@ -120,7 +124,11 @@ def __init__(self, docker_config: Dict[str, Any], self.docker_cmd = 'podman' if use_podman else 'docker' self.log_path = log_path - def _run(self, cmd, run_env='host') -> str: + def _run(self, + cmd, + run_env='host', + wait_for_docker_daemon: bool = False) -> str: + if run_env == 'docker': cmd = self._docker_expand_user(cmd, any_char=True) cmd = ' '.join(_with_interactive(cmd)) @@ -132,10 +140,24 @@ def _run(self, cmd, run_env='host') -> str: f' {shlex.quote(cmd)} ') logger.debug(f'+ {cmd}') - rc, stdout, stderr = self.runner.run(cmd, - require_outputs=True, - stream_logs=False, - log_path=self.log_path) + cnt = 0 + retry = 3 + while True: + rc, stdout, stderr = self.runner.run(cmd, + require_outputs=True, + stream_logs=False, + log_path=self.log_path) + if (not wait_for_docker_daemon or + DOCKER_PERMISSION_DENIED_STR not in stdout + stderr): + break + + cnt += 1 + if cnt > retry: + break + logger.info( + 'Failed to run docker command, retrying in 10 seconds... ' + f'({cnt}/{retry})') + time.sleep(10) subprocess_utils.handle_returncode( rc, cmd, @@ -164,10 +186,12 @@ def initialize(self) -> str: # TODO(tian): Maybe support a command to get the login password? docker_login_config = DockerLoginConfig( **self.docker_config['docker_login_config']) - self._run(f'{self.docker_cmd} login --username ' - f'{docker_login_config.username} ' - f'--password {docker_login_config.password} ' - f'{docker_login_config.server}') + self._run( + f'{self.docker_cmd} login --username ' + f'{docker_login_config.username} ' + f'--password {docker_login_config.password} ' + f'{docker_login_config.server}', + wait_for_docker_daemon=True) # We automatically add the server prefix to the image name if # the user did not add it. 
server_prefix = f'{docker_login_config.server}/' @@ -177,11 +201,14 @@ def initialize(self) -> str: if self.docker_config.get('pull_before_run', True): assert specific_image, ('Image must be included in config if ' + 'pull_before_run is specified') - self._run(f'{self.docker_cmd} pull {specific_image}') + self._run(f'{self.docker_cmd} pull {specific_image}', + wait_for_docker_daemon=True) else: - self._run(f'{self.docker_cmd} image inspect {specific_image} ' - '1> /dev/null 2>&1 || ' - f'{self.docker_cmd} pull {specific_image}') + self._run( + f'{self.docker_cmd} image inspect {specific_image} ' + '1> /dev/null 2>&1 || ' + f'{self.docker_cmd} pull {specific_image}', + wait_for_docker_daemon=True) logger.info(f'Starting container {self.container_name} with image ' f'{specific_image}') @@ -347,7 +374,8 @@ def _auto_configure_shm(self, run_options: List[str]) -> List[str]: def _check_container_exited(self) -> bool: if self.initialized: return True - output = (self._run( - check_docker_running_cmd(self.container_name, self.docker_cmd))) + output = (self._run(check_docker_running_cmd(self.container_name, + self.docker_cmd), + wait_for_docker_daemon=True)) return 'false' in output.lower( ) and 'no such object' not in output.lower() diff --git a/sky/skylet/providers/command_runner.py b/sky/skylet/providers/command_runner.py index 83abe476151..b6ea52c6eeb 100644 --- a/sky/skylet/providers/command_runner.py +++ b/sky/skylet/providers/command_runner.py @@ -1,8 +1,10 @@ """Sky's DockerCommandRunner.""" import json import os +import time from typing import Dict +import click from ray.autoscaler._private.cli_logger import cli_logger from ray.autoscaler._private.command_runner import DockerCommandRunner from ray.autoscaler._private.docker import check_docker_running_cmd @@ -81,16 +83,53 @@ class SkyDockerCommandRunner(DockerCommandRunner): `ray.autoscaler._private.command_runner.DockerCommandRunner`. """ + def _run_with_retry(self, cmd, **kwargs): + """Run a command with retries for docker.""" + cnt = 0 + max_retry = 3 + while True: + try: + return self.run(cmd, **kwargs) + except click.ClickException as e: + # We retry the command if it fails, because docker commands can + # fail due to the docker daemon not being ready yet. + # Ray command runner raise ClickException when the command + # fails. + cnt += 1 + if cnt >= max_retry: + raise e + cli_logger.warning( + f'Failed to run command {cmd!r}. ' + f'Retrying in 10 seconds. Retry count: {cnt}') + time.sleep(10) + # SkyPilot: New function to check whether a container is exited # (but not removed). This is due to previous `sky stop` command, # which will stop the container but not remove it. def _check_container_exited(self) -> bool: if self.initialized: return True - output = (self.ssh_command_runner.run( - check_docker_running_cmd(self.container_name, self.docker_cmd), - with_output=True, - ).decode('utf-8').strip()) + cnt = 0 + max_retry = 3 + cmd = check_docker_running_cmd(self.container_name, self.docker_cmd) + # We manually retry the command based on the output, as the command will + # not fail even if the docker daemon is not ready, due to the underlying + # usage of `|| true` in the command. + while True: + output = (self.run(cmd, with_output=True, + run_env='host').decode('utf-8').strip()) + if docker_utils.DOCKER_PERMISSION_DENIED_STR in output: + cnt += 1 + if cnt >= max_retry: + raise click.ClickException( + f'Failed to run command {cmd!r}. ' + f'Retry count: {cnt}. Output: {output}') + cli_logger.warning( + f'Failed to run command {cmd!r}. 
' + f'Retrying in 10 seconds. Retry count: {cnt}') + time.sleep(10) + else: + break return 'false' in output.lower( ) and 'no such object' not in output.lower() @@ -110,6 +149,9 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str], # If true, then we can start the container directly. # Notice that we will skip all setup commands, so we need to # manually start the ssh service. + # We also add retries when checking the container status to make sure + # the docker daemon is ready, as it may not be ready immediately after + # the VM is started. if self._check_container_exited(): self.initialized = True self.run(f'docker start {self.container_name}', run_env='host') @@ -121,12 +163,10 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str], # TODO(tian): Maybe support a command to get the login password? docker_login_config: docker_utils.DockerLoginConfig = self.docker_config[ "docker_login_config"] - self.run('{} login --username {} --password {} {}'.format( - self.docker_cmd, - docker_login_config.username, - docker_login_config.password, - docker_login_config.server, - )) + self._run_with_retry( + f'{self.docker_cmd} login --username ' + f'{docker_login_config.username} --password ' + f'{docker_login_config.password} {docker_login_config.server}') # We automatically add the server prefix to the image name if # the user did not add it. server_prefix = f'{docker_login_config.server}/' @@ -134,15 +174,15 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str], specific_image = f'{server_prefix}{specific_image}' if self.docker_config.get('pull_before_run', True): - assert specific_image, ('Image must be included in config if ' + + assert specific_image, ('Image must be included in config if ' 'pull_before_run is specified') - self.run('{} pull {}'.format(self.docker_cmd, specific_image), - run_env='host') + self._run_with_retry(f'{self.docker_cmd} pull {specific_image}', + run_env='host') else: - - self.run(f'{self.docker_cmd} image inspect {specific_image} ' - '1> /dev/null 2>&1 || ' - f'{self.docker_cmd} pull {specific_image}') + self._run_with_retry( + f'{self.docker_cmd} image inspect {specific_image} ' + '1> /dev/null 2>&1 || ' + f'{self.docker_cmd} pull {specific_image}') # Bootstrap files cannot be bind mounted because docker opens the # underlying inode. When the file is switched, docker becomes outdated. diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index fd56448791c..e834ee1d0c8 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -40,15 +40,6 @@ provider: # NOTE: This is a new field added by SkyPilot to force use a specific VPC. vpc_name: {{vpc_name}} {% endif %} -{%- if docker_login_config is not none %} - # We put docker login config in provider section because ray's schema disabled - # additionalProperties for docker config. - # See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227 - docker_login_config: - username: {{docker_login_config.username}} - password: {{docker_login_config.password}} - server: {{docker_login_config.server}} -{%- endif %} use_internal_ips: {{use_internal_ips}} # Disable launch config check for worker nodes as it can cause resource # leakage. 
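The retry changes in `docker_utils.py` and `command_runner.py` above follow one pattern: if a docker command fails because the daemon is not ready yet (surfaced either as a raised error or as a "permission denied ... Docker daemon socket" message in the output), wait and retry a bounded number of times. A standalone sketch of that pattern — not the SkyPilot code itself; the function name and prints are illustrative:

```python
import subprocess
import time

DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
                                'the Docker daemon socket')


def run_with_docker_retry(cmd: str, max_retry: int = 3, wait: int = 10) -> str:
    """Runs `cmd` in a shell, retrying while the Docker daemon is not ready."""
    for attempt in range(max_retry + 1):
        proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        output = proc.stdout + proc.stderr
        if DOCKER_PERMISSION_DENIED_STR not in output:
            # Daemon is reachable; hand back the result (or a real failure).
            if proc.returncode != 0:
                raise RuntimeError(f'Command {cmd!r} failed: {output}')
            return proc.stdout
        if attempt < max_retry:
            print(f'Docker daemon not ready; retrying in {wait}s '
                  f'({attempt + 1}/{max_retry})')
            time.sleep(wait)
    raise RuntimeError(
        f'Docker daemon still not ready after {max_retry} retries.')


# Example (image name is illustrative):
# run_with_docker_retry('docker pull ubuntu:22.04')
```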
diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 369e7a52ec6..9ffe2a7958e 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -17,14 +17,14 @@ docker: {%- endif %} provider: - type: external - module: sky.skylet.providers.azure.AzureNodeProvider - location: {{region}} - # Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87 - # For Azure, ray distinguishes different instances by the resource_group, - # instead of the cluster_name. This ensures that ray creates new instances - # for different cluster_name. - resource_group: {{resource_group}} + type: external + module: sky.skylet.providers.azure.AzureNodeProvider + location: {{region}} + # Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87 + # For Azure, ray distinguishes different instances by the resource_group, + # instead of the cluster_name. This ensures that ray creates new instances + # for different cluster_name. + resource_group: {{resource_group}} {%- if docker_login_config is not none %} # We put docker login config in provider section because ray's schema disabled # additionalProperties for docker config. @@ -34,17 +34,17 @@ provider: password: {{docker_login_config.password}} server: {{docker_login_config.server}} {%- endif %} - # Keep (otherwise cannot reuse when re-provisioning). - # teardown(terminate=True) will override this. - cache_stopped_nodes: True - # subscription id of the azure user - subscription_id: {{azure_subscription_id}} - # Disable launch config check for worker nodes as it can cause resource - # leakage. - # Reference: https://github.com/ray-project/ray/blob/cd1ba65e239360c8a7b130f991ed414eccc063ce/python/ray/autoscaler/_private/autoscaler.py#L1115 - # The upper-level SkyPilot code has make sure there will not be resource - # leakage. - disable_launch_config_check: true + # Keep (otherwise cannot reuse when re-provisioning). + # teardown(terminate=True) will override this. + cache_stopped_nodes: True + # subscription id of the azure user + subscription_id: {{azure_subscription_id}} + # Disable launch config check for worker nodes as it can cause resource + # leakage. + # Reference: https://github.com/ray-project/ray/blob/cd1ba65e239360c8a7b130f991ed414eccc063ce/python/ray/autoscaler/_private/autoscaler.py#L1115 + # The upper-level SkyPilot code has make sure there will not be resource + # leakage. + disable_launch_config_check: true auth: diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 76c818b8aef..7a92c4d4429 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -15,6 +15,15 @@ docker: {%- if gpu is not none %} --gpus all {%- endif %} +{%- if docker_login_config is not none %} + docker_login_config: + username: |- + {{docker_login_config.username}} + password: |- + {{docker_login_config.password}} + server: |- + {{docker_login_config.server}} +{%- endif %} {%- endif %} provider: @@ -37,15 +46,6 @@ provider: {% if firewall_rule is not none %} firewall_rule: {{firewall_rule}} {% endif %} -{%- if docker_login_config is not none %} - # We put docker login config in provider section because ray's schema disabled - # additionalProperties for docker config. 
- # See: https://github.com/ray-project/ray/blob/d2fc4823126927b2c54f89ec72fa3d24b442e6a3/python/ray/autoscaler/ray-schema.json#L227 - docker_login_config: - username: {{docker_login_config.username}} - password: {{docker_login_config.password}} - server: {{docker_login_config.server}} -{%- endif %} use_internal_ips: {{use_internal_ips}} {%- if tpu_vm %} _has_tpus: True diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 6206f50a8df..25a7764686b 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1100,6 +1100,14 @@ def test_job_queue_with_docker(generic_cloud: str): 'sleep 5', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep RUNNING', f'sky cancel -y {name} 3', + f'sky stop -y {name}', + # Make sure the job status preserve after stop and start the + # cluster. This is also a test for the docker container to be + # preserved after stop and start. + f'sky start -y {name}', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-1 | grep FAILED', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-2 | grep CANCELLED', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep CANCELLED', f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', f'sky exec {name} --gpus T4:1 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"', f'sky logs {name} 4 --status', From cb695d53d99f78be96d1f690f8b4097c5e309750 Mon Sep 17 00:00:00 2001 From: Sheth Date: Mon, 26 Feb 2024 13:56:24 -0800 Subject: [PATCH 14/42] added comments --- examples/unsloth/unsloth.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 4c01a5ca220..548871753aa 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -15,10 +15,11 @@ resources: disk_size: 128 file_mounts: + # Creates a new bucket my-unsloth-checkpoints and mounts it at /outputs /outputs: - name: my-unsloth-checkpoints + name: my-unsloth-checkpoints # Ensure this name is unique -workdir: . 
+workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth setup: | set -ex From 888f1a8b39abedef37361a66f42c353fe94b9dc0 Mon Sep 17 00:00:00 2001 From: Sheth Date: Mon, 26 Feb 2024 21:58:29 -0800 Subject: [PATCH 15/42] quick fix --- examples/unsloth/unsloth_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 400f2c8402b..255dfb560cf 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -71,7 +71,7 @@ fp16=not torch.cuda.is_bf16_supported(), bf16=torch.cuda.is_bf16_supported(), logging_steps=1, - output_dir=args.output_dir[1:], + output_dir=("~" + args.output_dir), optim="adamw_8bit", seed=3407, save_steps=10, From 48776693be2f25a25a38c6cc8de8df2044c8826f Mon Sep 17 00:00:00 2001 From: Sheth Date: Wed, 28 Feb 2024 18:39:49 -0800 Subject: [PATCH 16/42] finished pip issues --- examples/unsloth/unsloth.yaml | 18 ++++++++++++++---- examples/unsloth/unsloth_example.py | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index 548871753aa..e0e07d95c94 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -23,10 +23,20 @@ workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth setup: | set -ex - pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ - --index-url https://download.pytorch.org/whl/cu118 - pip install ipython - pip install "unsloth[cu118_torch220] @ git+https://github.com/unslothai/unsloth.git" + pip install --upgrade pip + cuda_version=$(nvcc --version | grep "release" | awk '{print $6}' | cut -c 2-) + + if [[ "$cuda_version" == "12.1"* ]]; then + pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ + --index-url https://download.pytorch.org/whl/cu121 + pip install ipython + pip install "unsloth[cu121-torch220] @ git+https://github.com/unslothai/unsloth.git" + else + pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \ + --index-url https://download.pytorch.org/whl/cu118 + pip install ipython + pip install "unsloth[cu118-torch220] @ git+https://github.com/unslothai/unsloth.git" + fi run: | python unsloth_example.py --output-dir /outputs diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 255dfb560cf..9f0895096d0 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -71,7 +71,7 @@ fp16=not torch.cuda.is_bf16_supported(), bf16=torch.cuda.is_bf16_supported(), logging_steps=1, - output_dir=("~" + args.output_dir), + output_dir=(args.output_dir), optim="adamw_8bit", seed=3407, save_steps=10, From 7a208dd9c91aceaff871efb8cefd61ac4510725a Mon Sep 17 00:00:00 2001 From: Sheth Date: Wed, 28 Feb 2024 18:43:25 -0800 Subject: [PATCH 17/42] fix --- examples/unsloth/unsloth.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/unsloth/unsloth.yaml b/examples/unsloth/unsloth.yaml index e0e07d95c94..2c163b42121 100644 --- a/examples/unsloth/unsloth.yaml +++ b/examples/unsloth/unsloth.yaml @@ -19,7 +19,7 @@ file_mounts: /outputs: name: my-unsloth-checkpoints # Ensure this name is unique -workdir: /Users/hriday/sky-unsloth/skypilot/examples/unsloth +workdir: . 
setup: | set -ex From 152e36a6a40417fb1be3fcaf0bdd9b5eb671eefa Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 29 Feb 2024 11:00:16 -0800 Subject: [PATCH 18/42] fix storage error message, add example link to docs --- README.md | 2 +- docs/source/index.rst | 2 +- examples/unsloth/unsloth_example.py | 2 +- sky/data/storage.py | 10 +++++++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a1af8c63f7d..93abecb0fa4 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ Runnable examples: - [LocalGPT](./llm/localgpt) - [Falcon](./llm/falcon) - Add yours here & see more in [`llm/`](./llm)! -- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), and [many more (`examples/`)](./examples). +- Framework examples: [PyTorch DDP](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_torch.yaml), [DeepSpeed](./examples/deepspeed-multinode/sky.yaml), [JAX/Flax on TPU](https://github.com/skypilot-org/skypilot/blob/master/examples/tpu/tpuvm_mnist.yaml), [Stable Diffusion](https://github.com/skypilot-org/skypilot/tree/master/examples/stable_diffusion), [Detectron2](https://github.com/skypilot-org/skypilot/blob/master/examples/detectron2_docker.yaml), [Distributed](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_distributed_tf_app.py) [TensorFlow](https://github.com/skypilot-org/skypilot/blob/master/examples/resnet_app_storage.yaml), [Ray Train](examples/distributed_ray_train/ray_train.yaml), [NeMo](https://github.com/skypilot-org/skypilot/blob/master/examples/nemo/nemo.yaml), [programmatic grid search](https://github.com/skypilot-org/skypilot/blob/master/examples/huggingface_glue_imdb_grid_search_app.py), [Docker](https://github.com/skypilot-org/skypilot/blob/master/examples/docker/echo_app.yaml), [Unsloth](https://github.com/skypilot-org/skypilot/blob/master/examples/unsloth/unsloth.yaml) and [many more (`examples/`)](./examples). Follow updates: - [Twitter](https://twitter.com/skypilot_org) diff --git a/docs/source/index.rst b/docs/source/index.rst index 5c0f8a7f7c5..8e140c33ed9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,7 +85,7 @@ Runnable examples: * `Falcon `_ * Add yours here & see more in `llm/ `_! -* Framework examples: `PyTorch DDP `_, `DeepSpeed `_, `JAX/Flax on TPU `_, `Stable Diffusion `_, `Detectron2 `_, `Distributed `_ `TensorFlow `_, `NeMo `_, `programmatic grid search `_, `Docker `_, and `many more `_. 
+* Framework examples: `PyTorch DDP `_, `DeepSpeed `_, `JAX/Flax on TPU `_, `Stable Diffusion `_, `Detectron2 `_, `Distributed `_ `TensorFlow `_, `NeMo `_, `programmatic grid search `_, `Docker `_, `Unsloth `_ and `many more `_. Follow updates: diff --git a/examples/unsloth/unsloth_example.py b/examples/unsloth/unsloth_example.py index 9f0895096d0..4e488ad82f0 100644 --- a/examples/unsloth/unsloth_example.py +++ b/examples/unsloth/unsloth_example.py @@ -71,7 +71,7 @@ fp16=not torch.cuda.is_bf16_supported(), bf16=torch.cuda.is_bf16_supported(), logging_steps=1, - output_dir=(args.output_dir), + output_dir=args.output_dir, optim="adamw_8bit", seed=3407, save_steps=10, diff --git a/sky/data/storage.py b/sky/data/storage.py index 4ca8441be3a..9356a50f365 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -333,14 +333,18 @@ def _validate_existing_bucket(self): # bucket's URL as 'source'. if handle is None: with ux_utils.print_exception_no_traceback(): + store_prefix = get_store_prefix(StoreType.from_store(self)) raise exceptions.StorageSpecError( 'Attempted to mount a non-sky managed bucket ' f'{self.name!r} without specifying the storage source.' - ' To mount an externally created bucket (e.g., ' + f' Bucket {self.name!r} already exists. \n' + ' • To create a new bucket, specify a unique name.\n' + ' • To mount an externally created bucket (e.g., ' 'created through cloud console or cloud cli), ' 'specify the bucket URL in the source field ' - 'instead of its name. E.g., replace `name: external-' - 'bucket` with `source: gs://external-bucket`.') + 'instead of its name. I.e., replace ' + f'`name: {self.name}` with ' + f'`source: {store_prefix}{self.name}`.') class Storage(object): From c86849490ebc453b095ccb6713dd52306341522e Mon Sep 17 00:00:00 2001 From: Sheth Date: Thu, 18 Apr 2024 15:53:13 -0700 Subject: [PATCH 19/42] prototype for gcs --- hriday/cloud-storage.py | 1 + hriday/cloud-storage.yaml | 9 ++++++++ sky/backends/cloud_vm_ray_backend.py | 8 +++++-- sky/data/data_utils.py | 15 +++++++++--- sky/data/mounting_utils.py | 34 ++++++++++++++++++++++++---- sky/data/storage.py | 18 +++++++++++++++ sky/task.py | 3 ++- 7 files changed, 77 insertions(+), 11 deletions(-) create mode 100644 hriday/cloud-storage.py create mode 100644 hriday/cloud-storage.yaml diff --git a/hriday/cloud-storage.py b/hriday/cloud-storage.py new file mode 100644 index 00000000000..292e00d620b --- /dev/null +++ b/hriday/cloud-storage.py @@ -0,0 +1 @@ +a = 2 + 2 diff --git a/hriday/cloud-storage.yaml b/hriday/cloud-storage.yaml new file mode 100644 index 00000000000..517204154c2 --- /dev/null +++ b/hriday/cloud-storage.yaml @@ -0,0 +1,9 @@ +file_mounts: + /my_data: + source: gs://hriday-test/ + mode: RCLONE + store: GCS + +workdir: /Users/hriday/sky-unsloth/skypilot/hriday + +run: python3 cloud-storage.py diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 44ade8c9c5e..48f241313ff 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4388,7 +4388,8 @@ def _execute_storage_mounts( storage_mounts = { path: storage_mount for path, storage_mount in storage_mounts.items() - if storage_mount.mode == storage_lib.StorageMode.MOUNT + if (storage_mount.mode == storage_lib.StorageMode.MOUNT or + storage_mount.mode == storage_lib.StorageMode.RCLONE) } # Handle cases when there aren't any Storages with MOUNT mode. 
@@ -4424,7 +4425,10 @@ def _execute_storage_mounts( 'successfully without mounting the bucket.') # Get the first store and use it to mount store = list(storage_obj.stores.values())[0] - mount_cmd = store.mount_command(dst) + if storage_obj.mode == storage_lib.StorageMode.MOUNT: + mount_cmd = store.mount_command(dst) + else: + mount_cmd = store.mount_command_rclone(dst) src_print = (storage_obj.source if storage_obj.source else storage_obj.name) if isinstance(src_print, list): diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index 21717ec739a..564eecac68b 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -12,6 +12,7 @@ from filelock import FileLock +from sky import clouds from sky import exceptions from sky import sky_logging from sky.adaptors import aws @@ -402,6 +403,7 @@ class Rclone(): # to their respective profile prefix class RcloneClouds(Enum): IBM = 'sky-ibm-' + GCP = 'sky-gcp' @staticmethod def generate_rclone_bucket_profile_name(bucket_name: str, @@ -422,10 +424,10 @@ def generate_rclone_bucket_profile_name(bucket_name: str, @staticmethod def get_rclone_config(bucket_name: str, cloud: RcloneClouds, - region: str) -> str: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, cloud) + region: Optional[str]) -> str: if cloud is Rclone.RcloneClouds.IBM: + bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( + bucket_name, cloud) access_key_id, secret_access_key = ibm.get_hmac_keys() config_data = textwrap.dedent(f"""\ [{bucket_rclone_profile}] @@ -438,6 +440,13 @@ def get_rclone_config(bucket_name: str, cloud: RcloneClouds, location_constraint = {region}-smart acl = private """) + elif cloud is Rclone.RcloneClouds.GCP: + config_data = textwrap.dedent(f"""\ + [{Rclone.RcloneClouds.GCP}] + type = google cloud storage + project_number = {clouds.GCP.get_project_id()} + bucket_acl = private + """) else: with ux_utils.print_exception_no_traceback(): raise NotImplementedError('No rclone configuration builder was ' diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 2f4e37a1b66..bb6076fb681 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -12,7 +12,9 @@ _RENAME_DIR_LIMIT = 10000 # https://github.com/GoogleCloudPlatform/gcsfuse/releases GCSFUSE_VERSION = '1.3.0' - +RCLONE_INSTALL_COMMAND = ('rclone version >/dev/null 2>&1 || ' + '(curl https://rclone.org/install.sh | ' + 'sudo bash)') def get_s3_mount_install_cmd() -> str: """Returns a command to install S3 mount utility goofys.""" @@ -53,6 +55,31 @@ def get_gcs_mount_cmd(bucket_name: str, mount_path: str) -> str: f'{bucket_name} {mount_path}') return mount_cmd +def get_gcs_mount_install_cmd_rclone() -> str: + return RCLONE_INSTALL_COMMAND + +def get_gcs_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, + bucket_rclone_profile: str, bucket_name: str, + mount_path: str) -> str: + """Returns a command to mount an GCP GCS bucket using rclone.""" + # creates a fusermount soft link on older (<22) Ubuntu systems for + # rclone's mount utility. + set_fuser3_soft_link = ('[ ! -f /bin/fusermount3 ] && ' + 'sudo ln -s /bin/fusermount /bin/fusermount3 || ' + 'true') + # stores bucket profile in rclone config file at the cluster's nodes. + configure_rclone_profile = (f'{set_fuser3_soft_link}; ' + 'mkdir -p ~/.config/rclone/ && ' + f'echo "{rclone_config_data}" >> ' + f'{rclone_config_path}') + # --daemon will keep the mounting process running in the background. 
+ mount_cmd = (f'{configure_rclone_profile} && ' + 'rclone mount ' + f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' + '--daemon --daemon --daemon-wait 0 \ + --allow-other --rc --vfs-cache-mode full &&' #todo: figure out if this should be a semicolon or an && + 'rclone rc vfs/refresh') + return mount_cmd def get_r2_mount_cmd(r2_credentials_path: str, r2_profile_name: str, endpoint_url: str, bucket_name: str, @@ -69,10 +96,7 @@ def get_r2_mount_cmd(r2_credentials_path: str, r2_profile_name: str, def get_cos_mount_install_cmd() -> str: """Returns a command to install IBM COS mount utility rclone.""" - install_cmd = ('rclone version >/dev/null 2>&1 || ' - '(curl https://rclone.org/install.sh | ' - 'sudo bash)') - return install_cmd + return RCLONE_INSTALL_COMMAND def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, diff --git a/sky/data/storage.py b/sky/data/storage.py index 06cdcbca62c..e782552abbd 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -140,6 +140,7 @@ def from_store(cls, store: 'AbstractStore') -> 'StoreType': class StorageMode(enum.Enum): MOUNT = 'MOUNT' COPY = 'COPY' + RCLONE = 'RCLONE' def get_storetype_from_cloud(cloud: clouds.Cloud) -> StoreType: @@ -1808,6 +1809,23 @@ def mount_command(self, mount_path: str) -> str: f'gcsfuse --version | grep -q {mounting_utils.GCSFUSE_VERSION}') return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd, version_check_cmd) + + + def mount_command_rclone(self, mount_path: str) -> str: + install_cmd = mounting_utils.get_gcs_mount_install_cmd_rclone() + rclone_config_data = Rclone.get_rclone_config( + self.bucket.name, + Rclone.RcloneClouds.GCP, + None + ) + mount_cmd = mounting_utils.get_gcs_mount_cmd_rclone(rclone_config_data, + Rclone.RCLONE_CONFIG_PATH, + Rclone.RcloneClouds.GCP, + self.bucket.name, + mount_path) + return mounting_utils.get_mounting_command(mount_path, install_cmd, + mount_cmd) + def _download_file(self, remote_path: str, local_path: str) -> None: """Downloads file from remote to local on GS bucket diff --git a/sky/task.py b/sky/task.py index ad3519f92e4..ae8da4baf72 100644 --- a/sky/task.py +++ b/sky/task.py @@ -1083,7 +1083,8 @@ def get_required_cloud_features( # Storage mounting for _, storage_mount in self.storage_mounts.items(): - if storage_mount.mode == storage_lib.StorageMode.MOUNT: + if (storage_mount.mode == storage_lib.StorageMode.MOUNT or + storage_mount.mode == storage_lib.StorageMode.RCLONE): required_features.add( clouds.CloudImplementationFeatures.STORAGE_MOUNTING) break From 953b4a61413cad15562022036801888012ce272e Mon Sep 17 00:00:00 2001 From: Sheth Date: Thu, 18 Apr 2024 15:55:24 -0700 Subject: [PATCH 20/42] removed non git files --- hriday/cloud-storage.py | 1 - hriday/cloud-storage.yaml | 9 --------- 2 files changed, 10 deletions(-) delete mode 100644 hriday/cloud-storage.py delete mode 100644 hriday/cloud-storage.yaml diff --git a/hriday/cloud-storage.py b/hriday/cloud-storage.py deleted file mode 100644 index 292e00d620b..00000000000 --- a/hriday/cloud-storage.py +++ /dev/null @@ -1 +0,0 @@ -a = 2 + 2 diff --git a/hriday/cloud-storage.yaml b/hriday/cloud-storage.yaml deleted file mode 100644 index 517204154c2..00000000000 --- a/hriday/cloud-storage.yaml +++ /dev/null @@ -1,9 +0,0 @@ -file_mounts: - /my_data: - source: gs://hriday-test/ - mode: RCLONE - store: GCS - -workdir: /Users/hriday/sky-unsloth/skypilot/hriday - -run: python3 cloud-storage.py From e63e943ec8d0a77d198add972ec861316124e1a8 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 
21 Apr 2024 02:47:50 -0700 Subject: [PATCH 21/42] rclone support for aws --- sky/data/data_utils.py | 16 ++++++++--- sky/data/mounting_utils.py | 55 +++++++++++++++++++------------------- sky/data/storage.py | 30 ++++++++++++++++++--- 3 files changed, 66 insertions(+), 35 deletions(-) diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index 564eecac68b..46851d81eff 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -404,6 +404,7 @@ class Rclone(): class RcloneClouds(Enum): IBM = 'sky-ibm-' GCP = 'sky-gcp' + AWS = 'sky-aws' @staticmethod def generate_rclone_bucket_profile_name(bucket_name: str, @@ -425,9 +426,9 @@ def generate_rclone_bucket_profile_name(bucket_name: str, @staticmethod def get_rclone_config(bucket_name: str, cloud: RcloneClouds, region: Optional[str]) -> str: - if cloud is Rclone.RcloneClouds.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( + bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( bucket_name, cloud) + if cloud is Rclone.RcloneClouds.IBM: access_key_id, secret_access_key = ibm.get_hmac_keys() config_data = textwrap.dedent(f"""\ [{bucket_rclone_profile}] @@ -442,11 +443,20 @@ def get_rclone_config(bucket_name: str, cloud: RcloneClouds, """) elif cloud is Rclone.RcloneClouds.GCP: config_data = textwrap.dedent(f"""\ - [{Rclone.RcloneClouds.GCP}] + [{bucket_rclone_profile}] type = google cloud storage project_number = {clouds.GCP.get_project_id()} bucket_acl = private """) + elif cloud is Rclone.RcloneClouds.AWS: + config_data = textwrap.dedent(f"""\ + [{bucket_rclone_profile}] + type = s3 + provider = AWS + access_key_id = {aws.session().get_credentials().get_frozen_credentials().access_key} + secret_access_key = {aws.session().get_credentials().get_frozen_credentials().secret_key} + acl = private + """) else: with ux_utils.print_exception_no_traceback(): raise NotImplementedError('No rclone configuration builder was ' diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index bb6076fb681..24347f18908 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -55,13 +55,24 @@ def get_gcs_mount_cmd(bucket_name: str, mount_path: str) -> str: f'{bucket_name} {mount_path}') return mount_cmd -def get_gcs_mount_install_cmd_rclone() -> str: - return RCLONE_INSTALL_COMMAND -def get_gcs_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, - bucket_rclone_profile: str, bucket_name: str, - mount_path: str) -> str: - """Returns a command to mount an GCP GCS bucket using rclone.""" +def get_r2_mount_cmd(r2_credentials_path: str, r2_profile_name: str, + endpoint_url: str, bucket_name: str, + mount_path: str) -> str: + """Returns a command to install R2 mount utility goofys.""" + mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} ' + f'AWS_PROFILE={r2_profile_name} goofys -o allow_other ' + f'--stat-cache-ttl {_STAT_CACHE_TTL} ' + f'--type-cache-ttl {_TYPE_CACHE_TTL} ' + f'--endpoint {endpoint_url} ' + f'{bucket_name} {mount_path}') + return mount_cmd + + +def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, + bucket_rclone_profile: str, bucket_name: str, + mount_path: str) -> str: + """Returns a command to mount an IBM COS bucket using rclone.""" # creates a fusermount soft link on older (<22) Ubuntu systems for # rclone's mount utility. set_fuser3_soft_link = ('[ ! 
-f /bin/fusermount3 ] && ' @@ -76,33 +87,19 @@ def get_gcs_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, mount_cmd = (f'{configure_rclone_profile} && ' 'rclone mount ' f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' - '--daemon --daemon --daemon-wait 0 \ - --allow-other --rc --vfs-cache-mode full &&' #todo: figure out if this should be a semicolon or an && - 'rclone rc vfs/refresh') - return mount_cmd - -def get_r2_mount_cmd(r2_credentials_path: str, r2_profile_name: str, - endpoint_url: str, bucket_name: str, - mount_path: str) -> str: - """Returns a command to install R2 mount utility goofys.""" - mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} ' - f'AWS_PROFILE={r2_profile_name} goofys -o allow_other ' - f'--stat-cache-ttl {_STAT_CACHE_TTL} ' - f'--type-cache-ttl {_TYPE_CACHE_TTL} ' - f'--endpoint {endpoint_url} ' - f'{bucket_name} {mount_path}') + '--daemon') return mount_cmd -def get_cos_mount_install_cmd() -> str: - """Returns a command to install IBM COS mount utility rclone.""" +def get_mount_install_cmd_rclone() -> str: + """Returns a command to install mount utility rclone.""" return RCLONE_INSTALL_COMMAND -def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, - bucket_rclone_profile: str, bucket_name: str, - mount_path: str) -> str: - """Returns a command to mount an IBM COS bucket using rclone.""" +def get_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, + bucket_rclone_profile: str, bucket_name: str, + mount_path: str) -> str: + """Returns a command to mount a GCP/AWS bucket using rclone.""" # creates a fusermount soft link on older (<22) Ubuntu systems for # rclone's mount utility. set_fuser3_soft_link = ('[ ! -f /bin/fusermount3 ] && ' @@ -117,7 +114,9 @@ def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, mount_cmd = (f'{configure_rclone_profile} && ' 'rclone mount ' f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' - '--daemon') + '--daemon --daemon --daemon-wait 0 \ + --allow-other --rc --vfs-cache-mode full &&' #todo: figure out if this should be a semicolon or an && + 'rclone rc vfs/refresh') return mount_cmd diff --git a/sky/data/storage.py b/sky/data/storage.py index e782552abbd..a7859b08afc 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -1051,6 +1051,9 @@ def __init__(self, self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, sync_on_reconstruction) + self.bucket_rclone_profile = \ + Rclone.generate_rclone_bucket_profile_name( + self.name, Rclone.RcloneClouds.AWS) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -1369,6 +1372,22 @@ def mount_command(self, mount_path: str) -> str: mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) + + def mount_command_rclone(self, mount_path: str) -> str: + install_cmd = mounting_utils.get_mount_install_cmd_rclone() + rclone_config_data = Rclone.get_rclone_config( + self.bucket.name, + Rclone.RcloneClouds.AWS, + None + ) + mount_cmd = mounting_utils.get_mount_cmd_rclone(rclone_config_data, + Rclone.RCLONE_CONFIG_PATH, + self.bucket_rclone_profile, + self.bucket.name, + mount_path) + return mounting_utils.get_mounting_command(mount_path, install_cmd, + mount_cmd) + def _create_s3_bucket(self, bucket_name: str, @@ -1460,6 +1479,9 @@ def __init__(self, self.bucket: StorageHandle super().__init__(name, source, region, is_sky_managed, sync_on_reconstruction) + self.bucket_rclone_profile = \ + 
Rclone.generate_rclone_bucket_profile_name( + self.name, Rclone.RcloneClouds.GCP) def _validate(self): if self.source is not None: @@ -1812,15 +1834,15 @@ def mount_command(self, mount_path: str) -> str: def mount_command_rclone(self, mount_path: str) -> str: - install_cmd = mounting_utils.get_gcs_mount_install_cmd_rclone() + install_cmd = mounting_utils.get_mount_install_cmd_rclone() rclone_config_data = Rclone.get_rclone_config( self.bucket.name, Rclone.RcloneClouds.GCP, None ) - mount_cmd = mounting_utils.get_gcs_mount_cmd_rclone(rclone_config_data, + mount_cmd = mounting_utils.get_mount_cmd_rclone(rclone_config_data, Rclone.RCLONE_CONFIG_PATH, - Rclone.RcloneClouds.GCP, + self.bucket_rclone_profile, self.bucket.name, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, @@ -2617,7 +2639,7 @@ def mount_command(self, mount_path: str) -> str: mount_path: str; Path to mount the bucket to. """ # install rclone if not installed. - install_cmd = mounting_utils.get_cos_mount_install_cmd() + install_cmd = mounting_utils.get_mount_install_cmd_rclone() rclone_config_data = Rclone.get_rclone_config( self.bucket.name, Rclone.RcloneClouds.IBM, From a4ddd48e79b042c7e5d362ca11bc47c8acfcc0b2 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 21 Apr 2024 02:50:56 -0700 Subject: [PATCH 22/42] minor formatting fix --- sky/data/data_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index 46851d81eff..18200c55fe0 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -403,8 +403,8 @@ class Rclone(): # to their respective profile prefix class RcloneClouds(Enum): IBM = 'sky-ibm-' - GCP = 'sky-gcp' - AWS = 'sky-aws' + GCP = 'sky-gcp-' + AWS = 'sky-aws-' @staticmethod def generate_rclone_bucket_profile_name(bucket_name: str, From 32bb4fa09ffa2fafd335d20ce1f49b51f7e9a456 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 26 May 2024 22:49:18 -0700 Subject: [PATCH 23/42] fixed merge conflict --- examples/perf/storage_rawperf.yaml | 31 +++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/examples/perf/storage_rawperf.yaml b/examples/perf/storage_rawperf.yaml index 982a1e7c43a..50a47451877 100644 --- a/examples/perf/storage_rawperf.yaml +++ b/examples/perf/storage_rawperf.yaml @@ -14,6 +14,22 @@ # sky down bench # SkyPilot Storage delete +# SkyPilot Storage benchmarks using fio. +# +# Uses FIO to run benchmarks on SkyPilot Storage. We use Azure's recommended +# parameters for fio to measure storage performance. +# See https://docs.microsoft.com/en-us/azure/virtual-machines/disks-benchmarks +# +# Also measures S3->EBS bandwidth using aws s3 cp, which is used in COPY mode. +# +# Note that random writes are not supported by SkyPilot Storage, and thus +# not benchmarked. +# +# Usage: +# sky launch -y -c bench storage_rawperf.yaml +# sky down bench +# SkyPilot Storage delete + name: storage-demo resources: @@ -22,7 +38,7 @@ resources: file_mounts: /skystorage-mount: - name: sky-romil-benchmark # Make sure this name is unique. + name: sky-hriday-benchmark-vibes-mount # Make sure this name is unique. mode: MOUNT setup: | @@ -32,33 +48,22 @@ setup: | run: | purge_io () { echo "Purging I/O caches..."; sync && echo 3 > sudo tee /proc/sys/vm/drop_caches; } purge_io - echo "Running EBS read benchmark..." 
- fio --name=64kseqreads --rw=read --direct=1 --ioengine=libaio --bs=64k --numjobs=4 --iodepth=128 --size=1G --group_reporting --directory=/tmp/ --output-format=json > /skystorage-mount/perf_read_ebs.json - purge_io echo "Running S3 read benchmark..." fio --name=64kseqreads --rw=read --direct=1 --ioengine=libaio --bs=64k --numjobs=4 --iodepth=128 --size=1G --group_reporting --directory=/skystorage-mount/ --output-format=json > /skystorage-mount/perf_read_s3.json purge_io - echo "Running EBS write benchmark..." - fio --name=64kseqwrites --rw=write --direct=1 --ioengine=libaio --bs=64k --numjobs=4 --iodepth=128 --size=1G --group_reporting --directory=/tmp/ --output-format=json > /skystorage-mount/perf_write_ebs.json - purge_io echo "Running S3 write benchmark..." fio --name=64kseqwrites --rw=write --direct=1 --ioengine=libaio --bs=64k --numjobs=4 --iodepth=128 --size=1G --group_reporting --directory=/skystorage-mount/ --output-format=json > /skystorage-mount/perf_write_s3.json # Change the s3 path here to the storage bucket name used above echo "Running S3 read benchmark with aws s3 cp (COPY mode setup bandwidth)..." purge_io - /usr/bin/time -o /skystorage-mount/perf_copymode_time.txt -f "%e" /bin/bash -c "aws s3 cp s3://sky-romil-benchmark/64kseqreads.0.0 ~/ > /tmp/perf_copymode_log.txt; sync && echo 3 > sudo tee /proc/sys/vm/drop_caches; " + /usr/bin/time -o /skystorage-mount/perf_copymode_time.txt -f "%e" /bin/bash -c "aws s3 cp s3://sky-hriday-benchmark-vibes-mount/64kseqreads.0.0 ~/ > /tmp/perf_copymode_log.txt; sync && echo 3 > sudo tee /proc/sys/vm/drop_caches; " sed -i -e 's/\r/\n/g' /tmp/perf_copymode_log.txt cp /tmp/perf_copymode_log.txt /skystorage-mount/perf_copymode_log.txt && rm /tmp/perf_copymode_log.txt echo -e '\n===== Benchmark Results =====' echo 'All results are reported as (bandwidth, IOPS)' echo -e '\n##### Sequential Read Results #####' - cat /skystorage-mount/perf_read_ebs.json | python3 -c "import sys, json; data = json.load(sys.stdin)['jobs'][0]; print('EBS:\t{:.2f} MB/s\t{:.2f} IOPS'.format(data['read']['bw_bytes']/(1000*1000), data['read']['iops']))" cat /skystorage-mount/perf_read_s3.json | python3 -c "import sys, json; data = json.load(sys.stdin)['jobs'][0]; print('S3:\t{:.2f} MB/s\t{:.2f} IOPS'.format(data['read']['bw_bytes']/(1000*1000), data['read']['iops']))" echo -e '\n##### Sequential Write Results #####' - cat /skystorage-mount/perf_write_ebs.json | python3 -c "import sys, json; data = json.load(sys.stdin)['jobs'][0]; print('EBS:\t{:.2f} MB/s\t{:.2f} IOPS'.format(data['write']['bw_bytes']/(1000*1000), data['write']['iops']))" cat /skystorage-mount/perf_write_s3.json | python3 -c "import sys, json; data = json.load(sys.stdin)['jobs'][0]; print('S3:\t{:.2f} MB/s\t{:.2f} IOPS'.format(data['write']['bw_bytes']/(1000*1000), data['write']['iops']))" - echo -e '\n##### SkyPilot Storage COPY mode setup bandwidth #####' - echo -n 'aws s3 cp reported bandwidth: ' && ( tail -2 /skystorage-mount/perf_copymode_log.txt | head -1 | grep -o '([^)]* MiB/s)' ) - echo -n 'Actual aws s3 cp -> EBS bandwidth (MB/s): ' && ( bc <<< $(stat -c %s ~/64kseqreads.0.0)/$(cat /skystorage-mount/perf_copymode_time.txt)/1000000) From 0481d2c11b5987bfacb2e49642bd0d5b60c01167 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 26 May 2024 22:49:32 -0700 Subject: [PATCH 24/42] fixed merge conflict --- llm/vicuna-llama-2/train.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llm/vicuna-llama-2/train.yaml b/llm/vicuna-llama-2/train.yaml index 8d35c2dff85..6c5c937863c 100644 --- 
a/llm/vicuna-llama-2/train.yaml +++ b/llm/vicuna-llama-2/train.yaml @@ -1,7 +1,13 @@ envs: +<<<<<<< Updated upstream HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. ARTIFACT_BUCKET_NAME: # TODO: Fill with your unique bucket name, or use --env to pass. WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass. +======= + HF_TOKEN: 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 # Change to your own huggingface token + ARTIFACT_BUCKET_NAME: YOUR_OWN_BUCKET_NAME # Change to your own bucket name + WANDB_API_KEY: "" # Change to your own wandb api key +>>>>>>> Stashed changes MODEL_SIZE: 7 USE_XFORMERS: 1 From abf4488cc402194041db2fb45d4b51ef9ce2b7e7 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 26 May 2024 22:50:36 -0700 Subject: [PATCH 25/42] fixed merge conflict --- llm/vicuna-llama-2/train.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llm/vicuna-llama-2/train.yaml b/llm/vicuna-llama-2/train.yaml index 6c5c937863c..8d35c2dff85 100644 --- a/llm/vicuna-llama-2/train.yaml +++ b/llm/vicuna-llama-2/train.yaml @@ -1,13 +1,7 @@ envs: -<<<<<<< Updated upstream HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass. ARTIFACT_BUCKET_NAME: # TODO: Fill with your unique bucket name, or use --env to pass. WANDB_API_KEY: # TODO: Fill with your own WANDB_API_KEY, or use --env to pass. -======= - HF_TOKEN: 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 # Change to your own huggingface token - ARTIFACT_BUCKET_NAME: YOUR_OWN_BUCKET_NAME # Change to your own bucket name - WANDB_API_KEY: "" # Change to your own wandb api key ->>>>>>> Stashed changes MODEL_SIZE: 7 USE_XFORMERS: 1 From 00a2a9f50691ac7ae96b85f064783c767c658daf Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 26 May 2024 22:53:32 -0700 Subject: [PATCH 26/42] reset perf example --- examples/perf/storage_rawperf.yaml | 32 +++++++++++++----------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/examples/perf/storage_rawperf.yaml b/examples/perf/storage_rawperf.yaml index 50a47451877..1f6e8616257 100644 --- a/examples/perf/storage_rawperf.yaml +++ b/examples/perf/storage_rawperf.yaml @@ -14,22 +14,6 @@ # sky down bench # SkyPilot Storage delete -# SkyPilot Storage benchmarks using fio. -# -# Uses FIO to run benchmarks on SkyPilot Storage. We use Azure's recommended -# parameters for fio to measure storage performance. -# See https://docs.microsoft.com/en-us/azure/virtual-machines/disks-benchmarks -# -# Also measures S3->EBS bandwidth using aws s3 cp, which is used in COPY mode. -# -# Note that random writes are not supported by SkyPilot Storage, and thus -# not benchmarked. -# -# Usage: -# sky launch -y -c bench storage_rawperf.yaml -# sky down bench -# SkyPilot Storage delete - name: storage-demo resources: @@ -38,7 +22,7 @@ resources: file_mounts: /skystorage-mount: - name: sky-hriday-benchmark-vibes-mount # Make sure this name is unique. + name: sky-romil-benchmark # Make sure this name is unique. mode: MOUNT setup: | @@ -48,22 +32,34 @@ setup: | run: | purge_io () { echo "Purging I/O caches..."; sync && echo 3 > sudo tee /proc/sys/vm/drop_caches; } purge_io + echo "Running EBS read benchmark..." + fio --name=64kseqreads --rw=read --direct=1 --ioengine=libaio --bs=64k --numjobs=4 --iodepth=128 --size=1G --group_reporting --directory=/tmp/ --output-format=json > /skystorage-mount/perf_read_ebs.json + purge_io echo "Running S3 read benchmark..." 
fio --name=64kseqreads --rw=read --direct=1 --ioengine=libaio --bs=64k --numjobs=4 --iodepth=128 --size=1G --group_reporting --directory=/skystorage-mount/ --output-format=json > /skystorage-mount/perf_read_s3.json purge_io + echo "Running EBS write benchmark..." + fio --name=64kseqwrites --rw=write --direct=1 --ioengine=libaio --bs=64k --numjobs=4 --iodepth=128 --size=1G --group_reporting --directory=/tmp/ --output-format=json > /skystorage-mount/perf_write_ebs.json + purge_io echo "Running S3 write benchmark..." fio --name=64kseqwrites --rw=write --direct=1 --ioengine=libaio --bs=64k --numjobs=4 --iodepth=128 --size=1G --group_reporting --directory=/skystorage-mount/ --output-format=json > /skystorage-mount/perf_write_s3.json # Change the s3 path here to the storage bucket name used above echo "Running S3 read benchmark with aws s3 cp (COPY mode setup bandwidth)..." purge_io - /usr/bin/time -o /skystorage-mount/perf_copymode_time.txt -f "%e" /bin/bash -c "aws s3 cp s3://sky-hriday-benchmark-vibes-mount/64kseqreads.0.0 ~/ > /tmp/perf_copymode_log.txt; sync && echo 3 > sudo tee /proc/sys/vm/drop_caches; " + /usr/bin/time -o /skystorage-mount/perf_copymode_time.txt -f "%e" /bin/bash -c "aws s3 cp s3://sky-romil-benchmark/64kseqreads.0.0 ~/ > /tmp/perf_copymode_log.txt; sync && echo 3 > sudo tee /proc/sys/vm/drop_caches; " sed -i -e 's/\r/\n/g' /tmp/perf_copymode_log.txt cp /tmp/perf_copymode_log.txt /skystorage-mount/perf_copymode_log.txt && rm /tmp/perf_copymode_log.txt echo -e '\n===== Benchmark Results =====' echo 'All results are reported as (bandwidth, IOPS)' echo -e '\n##### Sequential Read Results #####' + cat /skystorage-mount/perf_read_ebs.json | python3 -c "import sys, json; data = json.load(sys.stdin)['jobs'][0]; print('EBS:\t{:.2f} MB/s\t{:.2f} IOPS'.format(data['read']['bw_bytes']/(1000*1000), data['read']['iops']))" cat /skystorage-mount/perf_read_s3.json | python3 -c "import sys, json; data = json.load(sys.stdin)['jobs'][0]; print('S3:\t{:.2f} MB/s\t{:.2f} IOPS'.format(data['read']['bw_bytes']/(1000*1000), data['read']['iops']))" echo -e '\n##### Sequential Write Results #####' + cat /skystorage-mount/perf_write_ebs.json | python3 -c "import sys, json; data = json.load(sys.stdin)['jobs'][0]; print('EBS:\t{:.2f} MB/s\t{:.2f} IOPS'.format(data['write']['bw_bytes']/(1000*1000), data['write']['iops']))" cat /skystorage-mount/perf_write_s3.json | python3 -c "import sys, json; data = json.load(sys.stdin)['jobs'][0]; print('S3:\t{:.2f} MB/s\t{:.2f} IOPS'.format(data['write']['bw_bytes']/(1000*1000), data['write']['iops']))" + echo -e '\n##### SkyPilot Storage COPY mode setup bandwidth #####' + echo -n 'aws s3 cp reported bandwidth: ' && ( tail -2 /skystorage-mount/perf_copymode_log.txt | head -1 | grep -o '([^)]* MiB/s)' ) + echo -n 'Actual aws s3 cp -> EBS bandwidth (MB/s): ' && ( bc <<< $(stat -c %s ~/64kseqreads.0.0)/$(cat /skystorage-mount/perf_copymode_time.txt)/1000000) + \ No newline at end of file From 213eb26207b748a1242c7e63d3ee890ebe7d4893 Mon Sep 17 00:00:00 2001 From: Sheth Date: Sun, 26 May 2024 22:54:20 -0700 Subject: [PATCH 27/42] reset perf example --- examples/perf/storage_rawperf.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/perf/storage_rawperf.yaml b/examples/perf/storage_rawperf.yaml index 1f6e8616257..982a1e7c43a 100644 --- a/examples/perf/storage_rawperf.yaml +++ b/examples/perf/storage_rawperf.yaml @@ -62,4 +62,3 @@ run: | echo -e '\n##### SkyPilot Storage COPY mode setup bandwidth #####' echo -n 'aws s3 cp reported bandwidth: ' && ( tail -2 
/skystorage-mount/perf_copymode_log.txt | head -1 | grep -o '([^)]* MiB/s)' ) echo -n 'Actual aws s3 cp -> EBS bandwidth (MB/s): ' && ( bc <<< $(stat -c %s ~/64kseqreads.0.0)/$(cat /skystorage-mount/perf_copymode_time.txt)/1000000) - \ No newline at end of file From 1349ba04db7fe740dd3784f17e374013e317c3ae Mon Sep 17 00:00:00 2001 From: Sheth Date: Mon, 27 May 2024 23:42:09 -0700 Subject: [PATCH 28/42] added tests for rclone functionality --- tests/test_smoke.py | 60 +++++++++++++++++++++++++ tests/test_yamls/test_rclone_mount.yaml | 24 ++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tests/test_yamls/test_rclone_mount.yaml diff --git a/tests/test_smoke.py b/tests/test_smoke.py index db8f684c228..15c16090c26 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -999,6 +999,36 @@ def test_aws_storage_mounts_with_stop(): run_one_test(test) +@pytest.mark.aws +def test_aws_mount_rclone(): + name = _get_cluster_name() + storage_name = f'sky-test-{int(time.time())}' + bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( + storage_name, Rclone.RcloneClouds.AWS) + template_str = pathlib.Path( + 'tests/test_yamls/test_rclone_mount.yaml').read_text() + template = jinja2.Template(template_str) + content = template.render(store_type=f'{storage_lib.StoreType.S3.value}', + storage_name=storage_name, + bucket_rclone_profile=bucket_rclone_profile) + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + f.write(content) + f.flush() + file_path = f.name + test_commands = [ + *storage_setup_commands, + f'sky launch -y -c {name} --cloud aws {file_path}', + f'sky logs {name} 1 --status', # Ensure job succeeded. + ] + test = Test( + 'aws_mount_rclone', + test_commands, + f'sky down -y {name}; sky storage delete -y {storage_name}', + timeout=20 * 60, # 20 mins + ) + run_one_test(test) + + @pytest.mark.gcp def test_gcp_storage_mounts_with_stop(): name = _get_cluster_name() @@ -1031,6 +1061,36 @@ def test_gcp_storage_mounts_with_stop(): run_one_test(test) +@pytest.mark.gcp +def test_gcp_mount_rclone(): + name = _get_cluster_name() + storage_name = f'sky-test-{int(time.time())}' + bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( + storage_name, Rclone.RcloneClouds.GCP) + template_str = pathlib.Path( + 'tests/test_yamls/test_rclone_mount.yaml').read_text() + template = jinja2.Template(template_str) + content = template.render(store_type=storage_lib.StoreType.GCS.value, + storage_name=storage_name, + bucket_rclone_profile=bucket_rclone_profile) + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + f.write(content) + f.flush() + file_path = f.name + test_commands = [ + *storage_setup_commands, + f'sky launch -y -c {name} --cloud gcp {file_path}', + f'sky logs {name} 1 --status', # Ensure job succeeded. + ] + test = Test( + 'gcp_mount_rclone', + test_commands, + f'sky down -y {name}; sky storage delete -y {storage_name}', + timeout=20 * 60, # 20 mins + ) + run_one_test(test) + + @pytest.mark.kubernetes def test_kubernetes_storage_mounts(): # Tests bucket mounting on k8s, assuming S3 is configured. 
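For reference, a minimal sketch of the rendering step the new smoke tests above perform, assuming the repository root as the working directory; the timestamped bucket name, the 'S3' store type, and the 'sky-aws-' profile prefix are illustrative values that mirror the helpers added earlier in this series, not output captured from a real run:

import pathlib
import time

import jinja2

# Hypothetical example values; the real tests derive these at runtime.
storage_name = f'sky-test-{int(time.time())}'
bucket_rclone_profile = f'sky-aws-{storage_name}'

# Fill in the jinja variables of the template added below.
template = jinja2.Template(
    pathlib.Path('tests/test_yamls/test_rclone_mount.yaml').read_text())
content = template.render(store_type='S3',
                          storage_name=storage_name,
                          bucket_rclone_profile=bucket_rclone_profile)
print(content)  # rendered YAML with mode: RCLONE

The rendered file is what the `sky launch -y -c <cluster> --cloud aws <file>` command in the test bodies above consumes.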
diff --git a/tests/test_yamls/test_rclone_mount.yaml b/tests/test_yamls/test_rclone_mount.yaml new file mode 100644 index 00000000000..dd9edca6f06 --- /dev/null +++ b/tests/test_yamls/test_rclone_mount.yaml @@ -0,0 +1,24 @@ +file_mounts: + # Mounting private buckets in RCLONE mode + /mount_private_rclone: + name: {{storage_name}} + source: ~/tmp-workdir + store: {{store_type}} + mode: RCLONE + +run: | + set -ex + + # Check private bucket contents + ls -ltr /mount_private_rclone/foo + ls -ltr /mount_private_rclone/tmp\ file + + # Symlinks are not copied to buckets + ! ls /mount_private_rclone/circle-link + + # Write to private bucket in MOUNT mode should pass + echo "hello" > /mount_private_rclone/hello.txt + + # Ensure that write is reflected in bucket + rclone ls {{ bucket_rclone_profile }}:{{ storage_name }}/hello.txt + From 3b0923a213203315d1521ed7758c63a347c0f675 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Thu, 30 May 2024 17:09:35 +0000 Subject: [PATCH 29/42] update rclone vfs options --- sky/data/mounting_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 0f2ae5fc1d3..6a582bdb25b 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -115,9 +115,10 @@ def get_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, mount_cmd = (f'{configure_rclone_profile} && ' 'rclone mount ' f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' - '--daemon --daemon --daemon-wait 0 \ - --allow-other --rc --vfs-cache-mode full &&' #todo: figure out if this should be a semicolon or an && - 'rclone rc vfs/refresh') + '--daemon --daemon-wait 0 ' + '--allow-other --rc --vfs-cache-mode full ' + '--transfers 1 --vfs-cache-poll-interval 10s && ' #todo: figure out if this should be a semicolon or an && + 'rclone rc vfs/refresh') return mount_cmd From b82cc5b027969fbaa7d36a592f84449fdd573115 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Thu, 13 Jun 2024 03:14:10 +0000 Subject: [PATCH 30/42] testing --- sky/data/mounting_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 6a582bdb25b..ed2e17d76b0 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -116,8 +116,8 @@ def get_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, 'rclone mount ' f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' '--daemon --daemon-wait 0 ' - '--allow-other --rc --vfs-cache-mode full ' - '--transfers 1 --vfs-cache-poll-interval 10s && ' #todo: figure out if this should be a semicolon or an && + '--allow-other --rc --vfs-cache-mode full '#--dir-cache-time 10s ' + '--transfers 1 --vfs-cache-poll-interval 5s && ' #todo: figure out if this should be a semicolon or an && 'rclone rc vfs/refresh') return mount_cmd From bad9172d2f1034c83b4fa6c9f08390b3beb7c81d Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 30 Jun 2024 02:13:41 +0000 Subject: [PATCH 31/42] test --- sky/data/mounting_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index ed2e17d76b0..fe1ab90379a 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -117,7 +117,7 @@ def get_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' '--daemon --daemon-wait 0 ' '--allow-other --rc --vfs-cache-mode full '#--dir-cache-time 10s ' - '--transfers 1 
--vfs-cache-poll-interval 5s && ' #todo: figure out if this should be a semicolon or an && + '--transfers 1 && '#--vfs-cache-poll-interval 5s && ' #todo: figure out if this should be a semicolon or an && 'rclone rc vfs/refresh') return mount_cmd From 211f1bc0b4faf927a97f95603da8900a70750de5 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 30 Jun 2024 22:49:24 +0000 Subject: [PATCH 32/42] test --- sky/data/mounting_utils.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index fe1ab90379a..825e56ead3e 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -1,4 +1,5 @@ """Helper functions for object store mounting in Sky Storage""" +import os import random import textwrap from typing import Optional @@ -13,9 +14,12 @@ _RENAME_DIR_LIMIT = 10000 # https://github.com/GoogleCloudPlatform/gcsfuse/releases GCSFUSE_VERSION = '1.3.0' +# TODO(doyoung): need to install specific version of RCLONE RCLONE_INSTALL_COMMAND = ('rclone version >/dev/null 2>&1 || ' '(curl https://rclone.org/install.sh | ' 'sudo bash)') +# TODO(Doyoung): update the way how we keep on track of used ports for rclone remote control +_RCLONE_DEFAULT_RC_PORT = 5572 def get_s3_mount_install_cmd() -> str: """Returns a command to install S3 mount utility goofys.""" @@ -112,12 +116,20 @@ def get_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, f'echo "{rclone_config_data}" >> ' f'{rclone_config_path}') # --daemon will keep the mounting process running in the background. + global _RCLONE_DEFAULT_RC_PORT + _RCLONE_DEFAULT_RC_PORT += 1 + log_path = os.path.expanduser(f'~/.sky/rclone_log/{bucket_name}') + # when mounting multiple directories with vfs cache mode, it's handled by + # rclone to create separate cache directories at ~/.cache/rclone/vfs. It is + # not necessary to specify separate cache directories. mount_cmd = (f'{configure_rclone_profile} && ' 'rclone mount ' f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' '--daemon --daemon-wait 0 ' - '--allow-other --rc --vfs-cache-mode full '#--dir-cache-time 10s ' - '--transfers 1 && '#--vfs-cache-poll-interval 5s && ' #todo: figure out if this should be a semicolon or an && + # need to update the log fiel so it grabs the home directory from the remote instance. 
+ f'--log-file /home/gcpuser/.sky --log-level DEBUG ' #log related flags + f'--allow-other --rc --rc-addr 127.0.0.1:{_RCLONE_DEFAULT_RC_PORT} --vfs-cache-mode full &&'#--dir-cache-time 10s ' + #'--transfers 1 && '#--vfs-cache-poll-interval 5s && ' #todo: figure out if this should be a semicolon or an && 'rclone rc vfs/refresh') return mount_cmd From a312ac11f004d09fcb1473830fa2ccc8923e5d46 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Tue, 2 Jul 2024 04:44:39 +0000 Subject: [PATCH 33/42] update rclone config and rclone command options --- sky/data/data_utils.py | 2 +- sky/data/mounting_utils.py | 41 ++++++++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index 18200c55fe0..ddc18a08136 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -446,7 +446,7 @@ def get_rclone_config(bucket_name: str, cloud: RcloneClouds, [{bucket_rclone_profile}] type = google cloud storage project_number = {clouds.GCP.get_project_id()} - bucket_acl = private + bucket_policy_only = true """) elif cloud is Rclone.RcloneClouds.AWS: config_data = textwrap.dedent(f"""\ diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 825e56ead3e..c801abe8075 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -18,8 +18,6 @@ RCLONE_INSTALL_COMMAND = ('rclone version >/dev/null 2>&1 || ' '(curl https://rclone.org/install.sh | ' 'sudo bash)') -# TODO(Doyoung): update the way how we keep on track of used ports for rclone remote control -_RCLONE_DEFAULT_RC_PORT = 5572 def get_s3_mount_install_cmd() -> str: """Returns a command to install S3 mount utility goofys.""" @@ -116,24 +114,45 @@ def get_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, f'echo "{rclone_config_data}" >> ' f'{rclone_config_path}') # --daemon will keep the mounting process running in the background. - global _RCLONE_DEFAULT_RC_PORT - _RCLONE_DEFAULT_RC_PORT += 1 - log_path = os.path.expanduser(f'~/.sky/rclone_log/{bucket_name}') + # TODO(Doyoung): remove rclone log related scripts and options when done with implementation. + log_dir_path = os.path.expanduser(f'~/.sky/rclone_log') + log_file_path = os.path.join(log_dir_path, f'{bucket_name}.log') + create_log_cmd = f'mkdir -p {log_dir_path} && touch {log_file_path}' # when mounting multiple directories with vfs cache mode, it's handled by # rclone to create separate cache directories at ~/.cache/rclone/vfs. It is # not necessary to specify separate cache directories. - mount_cmd = (f'{configure_rclone_profile} && ' + mount_cmd = (f'{create_log_cmd}; ' + f'{configure_rclone_profile} && ' 'rclone mount ' f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' '--daemon --daemon-wait 0 ' # need to update the log fiel so it grabs the home directory from the remote instance. - f'--log-file /home/gcpuser/.sky --log-level DEBUG ' #log related flags - f'--allow-other --rc --rc-addr 127.0.0.1:{_RCLONE_DEFAULT_RC_PORT} --vfs-cache-mode full &&'#--dir-cache-time 10s ' - #'--transfers 1 && '#--vfs-cache-poll-interval 5s && ' #todo: figure out if this should be a semicolon or an && - 'rclone rc vfs/refresh') + #f'--log-file {log_file_path} --log-level DEBUG ' #log related flags + f'--allow-other --vfs-cache-mode full --dir-cache-time 30s ' + '--transfers 1 && --vfs-cache-poll-interval 5s') return mount_cmd +def _get_mount_binary(mount_cmd: str) -> str: + """Returns mounting binary in string given as the mount command. 
+ + Args: + mount_cmd: str; command used to mount a cloud storage. + + Returns: + str: name of the binary used to mount a cloud storage. + """ + if 'goofys' in mount_cmd: + return 'goofys' + elif 'gcsfuse' in mount_cmd: + return 'gcsfuse' + elif 'blobfuse2' in mount_cmd: + return 'blobfuse2' + else: + assert 'rclone' in mount_cmd + return 'rclone' + + def get_mounting_script( mount_path: str, mount_cmd: str, @@ -158,7 +177,7 @@ def get_mounting_script( str: Mounting script as a str. """ - mount_binary = mount_cmd.split()[0] + mount_binary = _get_mount_binary(mount_cmd) installed_check = f'[ -x "$(command -v {mount_binary})" ]' if version_check_cmd is not None: installed_check += f' && {version_check_cmd}' From 66a8e11a55edd9987ba15acad07018f288cb536c Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Thu, 4 Jul 2024 05:03:25 +0000 Subject: [PATCH 34/42] nit refactor --- sky/backends/cloud_vm_ray_backend.py | 10 ++++--- sky/data/data_utils.py | 39 +++++++++++++++------------- sky/data/mounting_utils.py | 23 +++++++++------- sky/data/storage.py | 10 +++---- 4 files changed, 45 insertions(+), 37 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 88a146b99c7..4e5c09458b7 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4548,7 +4548,7 @@ def _execute_storage_mounts( # Handle cases where `storage_mounts` is None. This occurs when users # initiate a 'sky start' command from a Skypilot version that predates # the introduction of the `storage_mounts_metadata` feature. - if not storage_mounts: + if storage_mounts is None: return # Process only mount mode objects here. COPY mode objects have been @@ -4558,10 +4558,11 @@ def _execute_storage_mounts( path: storage_mount for path, storage_mount in storage_mounts.items() if (storage_mount.mode == storage_lib.StorageMode.MOUNT or - storage_mount.mode == storage_lib.StorageMode.RCLONE) + storage_mount.mode == storage_lib.StorageMode.MOUNT_CACHE) } - # Handle cases when there aren't any Storages with MOUNT mode. + # Handle cases when there aren't any Storages with either MOUNT or + # MOUNT_CACHE mode. 
if not storage_mounts: return @@ -4591,7 +4592,8 @@ def _execute_storage_mounts( if storage_obj.mode == storage_lib.StorageMode.MOUNT: mount_cmd = store.mount_command(dst) else: - mount_cmd = store.mount_command_rclone(dst) + assert storage_obj.mode == storage_lib.StorageMode.MOUNT_CACHE + mount_cmd = store.mount_cache_command(dst) src_print = (storage_obj.source if storage_obj.source else storage_obj.name) if isinstance(src_print, list): diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index ddc18a08136..a2fc30b46da 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -402,9 +402,9 @@ class Rclone(): # Mapping of storage providers using rclone # to their respective profile prefix class RcloneClouds(Enum): - IBM = 'sky-ibm-' - GCP = 'sky-gcp-' AWS = 'sky-aws-' + GCP = 'sky-gcp-' + IBM = 'sky-ibm-' @staticmethod def generate_rclone_bucket_profile_name(bucket_name: str, @@ -412,33 +412,32 @@ def generate_rclone_bucket_profile_name(bucket_name: str, """Returns rclone profile name for specified bucket Args: - bucket_name (str): name of bucket - cloud (RcloneClouds): enum object of storage provider + bucket_name: str; name of bucket + cloud: RcloneClouds; enum object of storage provider supported via rclone """ try: - return cloud.value + bucket_name + profile_name = f'{cloud.value}{bucket_name}' + return profile_name except AttributeError as e: with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Value: {cloud} isn\'t a member of ' + raise ValueError(f'{cloud!r} is not a member of ' 'Rclone.RcloneClouds') from e @staticmethod def get_rclone_config(bucket_name: str, cloud: RcloneClouds, region: Optional[str]) -> str: + """ + """ bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( bucket_name, cloud) - if cloud is Rclone.RcloneClouds.IBM: - access_key_id, secret_access_key = ibm.get_hmac_keys() + if cloud is Rclone.RcloneClouds.AWS: config_data = textwrap.dedent(f"""\ [{bucket_rclone_profile}] type = s3 - provider = IBMCOS - access_key_id = {access_key_id} - secret_access_key = {secret_access_key} - region = {region} - endpoint = s3.{region}.cloud-object-storage.appdomain.cloud - location_constraint = {region}-smart + provider = AWS + access_key_id = {aws.session().get_credentials().get_frozen_credentials().access_key} + secret_access_key = {aws.session().get_credentials().get_frozen_credentials().secret_key} acl = private """) elif cloud is Rclone.RcloneClouds.GCP: @@ -448,13 +447,17 @@ def get_rclone_config(bucket_name: str, cloud: RcloneClouds, project_number = {clouds.GCP.get_project_id()} bucket_policy_only = true """) - elif cloud is Rclone.RcloneClouds.AWS: + elif cloud is Rclone.RcloneClouds.IBM: + access_key_id, secret_access_key = ibm.get_hmac_keys() config_data = textwrap.dedent(f"""\ [{bucket_rclone_profile}] type = s3 - provider = AWS - access_key_id = {aws.session().get_credentials().get_frozen_credentials().access_key} - secret_access_key = {aws.session().get_credentials().get_frozen_credentials().secret_key} + provider = IBMCOS + access_key_id = {access_key_id} + secret_access_key = {secret_access_key} + region = {region} + endpoint = s3.{region}.cloud-object-storage.appdomain.cloud + location_constraint = {region}-smart acl = private """) else: diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index c801abe8075..0fb810273d8 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -14,10 +14,8 @@ _RENAME_DIR_LIMIT = 10000 # https://github.com/GoogleCloudPlatform/gcsfuse/releases 
GCSFUSE_VERSION = '1.3.0' -# TODO(doyoung): need to install specific version of RCLONE -RCLONE_INSTALL_COMMAND = ('rclone version >/dev/null 2>&1 || ' - '(curl https://rclone.org/install.sh | ' - 'sudo bash)') +# https://github.com/rclone/rclone/releases +RCLONE_VERSION = '1.67.0' def get_s3_mount_install_cmd() -> str: """Returns a command to install S3 mount utility goofys.""" @@ -72,6 +70,16 @@ def get_r2_mount_cmd(r2_credentials_path: str, r2_profile_name: str, return mount_cmd + +def get_rclone_install_cmd() -> str: + """Returns a command to install Rclone.""" + install_cmd = ('wget -nc https://github.com/rclone/rclone/releases' + f'/download/v{RCLONE_VERSION}/rclone-v{RCLONE_VERSION}' + '-linux-amd64.deb -O /tmp/rclone.deb && ' + 'sudo dpkg --install /tmp/rclone.deb') + return install_cmd + + def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, bucket_rclone_profile: str, bucket_name: str, mount_path: str) -> str: @@ -94,12 +102,7 @@ def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, return mount_cmd -def get_mount_install_cmd_rclone() -> str: - """Returns a command to install mount utility rclone.""" - return RCLONE_INSTALL_COMMAND - - -def get_mount_cmd_rclone(rclone_config_data: str, rclone_config_path: str, +def get_mount_cache_cmd(rclone_config_data: str, rclone_config_path: str, bucket_rclone_profile: str, bucket_name: str, mount_path: str) -> str: """Returns a command to mount a GCP/AWS bucket using rclone.""" diff --git a/sky/data/storage.py b/sky/data/storage.py index 88b7ba1dea3..629e93e31ec 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -165,7 +165,7 @@ def store_prefix(self) -> str: class StorageMode(enum.Enum): MOUNT = 'MOUNT' COPY = 'COPY' - RCLONE = 'RCLONE' + MOUNT_CACHE = 'MOUNT_CACHE' class AbstractStore: @@ -1359,14 +1359,14 @@ def mount_command(self, mount_path: str) -> str: return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) - def mount_command_rclone(self, mount_path: str) -> str: + def mount_cache_command(self, mount_path: str) -> str: install_cmd = mounting_utils.get_mount_install_cmd_rclone() rclone_config_data = Rclone.get_rclone_config( self.bucket.name, Rclone.RcloneClouds.AWS, None ) - mount_cmd = mounting_utils.get_mount_cmd_rclone(rclone_config_data, + mount_cmd = mounting_utils.get_mount_cache_cmd(rclone_config_data, Rclone.RCLONE_CONFIG_PATH, self.bucket_rclone_profile, self.bucket.name, @@ -1819,14 +1819,14 @@ def mount_command(self, mount_path: str) -> str: mount_cmd, version_check_cmd) - def mount_command_rclone(self, mount_path: str) -> str: + def mount_cache_command(self, mount_path: str) -> str: install_cmd = mounting_utils.get_mount_install_cmd_rclone() rclone_config_data = Rclone.get_rclone_config( self.bucket.name, Rclone.RcloneClouds.GCP, None ) - mount_cmd = mounting_utils.get_mount_cmd_rclone(rclone_config_data, + mount_cmd = mounting_utils.get_mount_cache_cmd(rclone_config_data, Rclone.RCLONE_CONFIG_PATH, self.bucket_rclone_profile, self.bucket.name, From e628024ed7375008eccb04e85bf8fb03fe2b2519 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Fri, 5 Jul 2024 02:19:05 +0000 Subject: [PATCH 35/42] rclone refactor --- sky/backends/cloud_vm_ray_backend.py | 28 +++- sky/cloud_stores.py | 17 +-- sky/data/data_utils.py | 192 ++++++++++++------------ sky/data/mounting_utils.py | 69 ++++----- sky/data/storage.py | 105 ++++++------- sky/skylet/constants.py | 3 + sky/task.py | 4 +- tests/test_smoke.py | 31 ++-- tests/test_yamls/test_rclone_mount.yaml | 2 +- 9 files 
changed, 224 insertions(+), 227 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4e5c09458b7..3f79dd91caa 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -56,6 +56,7 @@ from sky.utils import command_runner from sky.utils import common_utils from sky.utils import controller_utils +from sky.utils import env_options from sky.utils import log_utils from sky.utils import resources_utils from sky.utils import rich_utils @@ -4591,9 +4592,11 @@ def _execute_storage_mounts( store = list(storage_obj.stores.values())[0] if storage_obj.mode == storage_lib.StorageMode.MOUNT: mount_cmd = store.mount_command(dst) + action_message = 'Mounting' else: assert storage_obj.mode == storage_lib.StorageMode.MOUNT_CACHE mount_cmd = store.mount_cache_command(dst) + action_message = 'Mounting cache mode' src_print = (storage_obj.source if storage_obj.source else storage_obj.name) if isinstance(src_print, list): @@ -4605,7 +4608,7 @@ def _execute_storage_mounts( target=dst, cmd=mount_cmd, run_rsync=False, - action_message='Mounting', + action_message=action_message, log_path=log_path, # Need to source bashrc, as the cloud specific CLI or SDK # may require PATH in bashrc. @@ -4623,12 +4626,23 @@ def _execute_storage_mounts( f' to an empty or non-existent path.') raise RuntimeError(error_msg) from None else: - # Strip the command (a big heredoc) from the exception - raise exceptions.CommandError( - e.returncode, - command='to mount', - error_msg=e.error_msg, - detailed_reason=e.detailed_reason) from None + # By default, raising an error caused from mounting_utils + # shows a big heredoc as part of it. Here, we want to + # conditionally show the heredoc only if SKYPILOT_DEBUG + # is set + if env_options.Options.SHOW_DEBUG_INFO.get(): + raise exceptions.CommandError( + e.returncode, + command='to mount', + error_msg=e.error_msg, + detailed_reason=e.detailed_reason) + else: + # Strip the command (a big heredoc) from the exception + raise exceptions.CommandError( + e.returncode, + command='to mount', + error_msg=e.error_msg, + detailed_reason=e.detailed_reason) from None end = time.time() logger.debug(f'Storage mount sync took {end - start} seconds.') diff --git a/sky/cloud_stores.py b/sky/cloud_stores.py index db20b531cb8..a5273feb27d 100644 --- a/sky/cloud_stores.py +++ b/sky/cloud_stores.py @@ -15,7 +15,7 @@ from sky.adaptors import ibm from sky.clouds import gcp from sky.data import data_utils -from sky.data.data_utils import Rclone +from sky.data import mounting_utils class CloudStorage: @@ -262,19 +262,18 @@ def is_directory(self, url: str) -> bool: def _get_rclone_sync_command(self, source: str, destination: str): bucket_name, data_path, bucket_region = data_utils.split_cos_path( source) - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - data_path_in_bucket = bucket_name + data_path - rclone_config_data = Rclone.get_rclone_config(bucket_name, - Rclone.RcloneClouds.IBM, - bucket_region) + rclone_profile_name = data_utils.Rclone.RcloneStores.IBM.get_profile_name( + bucket_name) + data_path_in_bucket = f'{bucket_name}{data_path}' + rclone_config = data_utils.Rclone.RcloneStores.IBM.get_config( + rclone_profile_name=rclone_profile_name, region=bucket_region) # configure_rclone stores bucket profile in remote cluster's rclone.conf configure_rclone = ( f' mkdir -p ~/.config/rclone/ &&' - f' echo "{rclone_config_data}">> {Rclone.RCLONE_CONFIG_PATH}') + f' 
echo "{rclone_config}">> {mounting_utils.RCLONE_CONFIG_PATH}') download_via_rclone = ( 'rclone copy ' - f'{bucket_rclone_profile}:{data_path_in_bucket} {destination}') + f'{rclone_profile_name}:{data_path_in_bucket} {destination}') all_commands = list(self._GET_RCLONE) all_commands.append(configure_rclone) diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index a2fc30b46da..7cb395a52f2 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -1,7 +1,7 @@ """Miscellaneous Utils for Sky Data """ import concurrent.futures -from enum import Enum +import enum from multiprocessing import pool import os import re @@ -19,6 +19,7 @@ from sky.adaptors import cloudflare from sky.adaptors import gcp from sky.adaptors import ibm +from sky.skylet import constants from sky.utils import ux_utils Client = Any @@ -388,94 +389,90 @@ def get_cos_regions() -> List[str]: ] -class Rclone(): +class Rclone: """Static class implementing common utilities of rclone without rclone sdk. Storage providers supported by rclone are required to: - - list their rclone profile prefix in RcloneClouds - - implement configuration in get_rclone_config() + - list their rclone profile prefix in RcloneStores + - implement configuration in get_config() """ - RCLONE_CONFIG_PATH = '~/.config/rclone/rclone.conf' - _RCLONE_ABS_CONFIG_PATH = os.path.expanduser(RCLONE_CONFIG_PATH) - # Mapping of storage providers using rclone # to their respective profile prefix - class RcloneClouds(Enum): - AWS = 'sky-aws-' - GCP = 'sky-gcp-' - IBM = 'sky-ibm-' - - @staticmethod - def generate_rclone_bucket_profile_name(bucket_name: str, - cloud: RcloneClouds) -> str: - """Returns rclone profile name for specified bucket - - Args: - bucket_name: str; name of bucket - cloud: RcloneClouds; enum object of storage provider - supported via rclone - """ - try: - profile_name = f'{cloud.value}{bucket_name}' - return profile_name - except AttributeError as e: - with ux_utils.print_exception_no_traceback(): - raise ValueError(f'{cloud!r} is not a member of ' - 'Rclone.RcloneClouds') from e - - @staticmethod - def get_rclone_config(bucket_name: str, cloud: RcloneClouds, - region: Optional[str]) -> str: - """ - """ - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, cloud) - if cloud is Rclone.RcloneClouds.AWS: - config_data = textwrap.dedent(f"""\ - [{bucket_rclone_profile}] - type = s3 - provider = AWS - access_key_id = {aws.session().get_credentials().get_frozen_credentials().access_key} - secret_access_key = {aws.session().get_credentials().get_frozen_credentials().secret_key} - acl = private - """) - elif cloud is Rclone.RcloneClouds.GCP: - config_data = textwrap.dedent(f"""\ - [{bucket_rclone_profile}] - type = google cloud storage - project_number = {clouds.GCP.get_project_id()} - bucket_policy_only = true - """) - elif cloud is Rclone.RcloneClouds.IBM: - access_key_id, secret_access_key = ibm.get_hmac_keys() - config_data = textwrap.dedent(f"""\ - [{bucket_rclone_profile}] - type = s3 - provider = IBMCOS - access_key_id = {access_key_id} - secret_access_key = {secret_access_key} - region = {region} - endpoint = s3.{region}.cloud-object-storage.appdomain.cloud - location_constraint = {region}-smart - acl = private - """) - else: - with ux_utils.print_exception_no_traceback(): - raise NotImplementedError('No rclone configuration builder was ' - f'implemented for cloud: {cloud}.') - return config_data + class RcloneStores(enum.Enum): + S3 = 'S3' + GCS = 'GCS' + IBM = 'IBM' + + def get_profile_name(self, 
bucket_name: str) -> str: + """ + """ + if self is Rclone.RcloneStores.S3: + return f'sky-s3-{bucket_name}' + elif self is Rclone.RcloneStores.GCS: + return f'sky-gcs-{bucket_name}' + elif self is Rclone.RcloneStores.IBM: + return f'sky-ibm-{bucket_name}' + else: + with ux_utils.print_exception_no_traceback(): + raise NotImplementedError( + f'Unsupported store type for Rclone: {self}') + + def get_config(self, + bucket_name: Optional[str] = None, + rclone_profile_name: Optional[str] = None, + region: Optional[str] = None) -> str: + """ + """ + if rclone_profile_name is None: + assert bucket_name is not None + rclone_profile_name = self.get_profile_name(bucket_name) + if self is Rclone.RcloneStores.S3: + aws_credentials = ( + aws.session().get_credentials().get_frozen_credentials()) + access_key_id = aws_credentials.access_key + secret_access_key = aws_credentials.secret_key + config = textwrap.dedent(f"""\ + [{rclone_profile_name}] + type = s3 + provider = AWS + access_key_id = {access_key_id} + secret_access_key = {secret_access_key} + acl = private + """) + elif self is Rclone.RcloneStores.GCS: + config = textwrap.dedent(f"""\ + [{rclone_profile_name}] + type = google cloud storage + project_number = {clouds.GCP.get_project_id()} + bucket_policy_only = true + """) + elif self is Rclone.RcloneStores.IBM: + access_key_id, secret_access_key = ibm.get_hmac_keys() + config = textwrap.dedent(f"""\ + [{rclone_profile_name}] + type = s3 + provider = IBMCOS + access_key_id = {access_key_id} + secret_access_key = {secret_access_key} + region = {region} + endpoint = s3.{region}.cloud-object-storage.appdomain.cloud + location_constraint = {region}-smart + acl = private + """) + else: + with ux_utils.print_exception_no_traceback(): + raise NotImplementedError( + f'Unsupported store type for Rclone: {self}') + return config @staticmethod - def store_rclone_config(bucket_name: str, cloud: RcloneClouds, + def store_rclone_config(bucket_name: str, cloud: RcloneStores, region: str) -> str: """Creates a configuration files for rclone - used for bucket syncing and mounting """ - - rclone_config_path = Rclone._RCLONE_ABS_CONFIG_PATH - config_data = Rclone.get_rclone_config(bucket_name, cloud, region) - - # Raise exception if rclone isn't installed + rclone_config_path = os.path.expanduser(constants.RCLONE_CONFIG_PATH) + config_data = cloud.get_config(bucket_name=bucket_name, region=region) try: subprocess.run('rclone version', shell=True, @@ -489,9 +486,7 @@ def store_rclone_config(bucket_name: str, cloud: RcloneClouds, '"curl https://rclone.org/install.sh ' '| sudo bash" ') from None - # create ~/.config/rclone/ if doesn't exist os.makedirs(os.path.dirname(rclone_config_path), exist_ok=True) - # create rclone.conf if doesn't exist if not os.path.isfile(rclone_config_path): open(rclone_config_path, 'w', encoding='utf-8').close() @@ -512,18 +507,17 @@ def store_rclone_config(bucket_name: str, cloud: RcloneClouds, return config_data @staticmethod - def get_region_from_rclone(bucket_name: str, cloud: RcloneClouds) -> str: + def get_region_from_rclone(bucket_name: str, cloud: RcloneStores) -> str: """Returns region field of the specified bucket in rclone.conf if bucket exists, else empty string""" - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, cloud) - with open(Rclone._RCLONE_ABS_CONFIG_PATH, 'r', - encoding='utf-8') as file: + rclone_profile = cloud.get_profile_name(bucket_name) + rclone_config_path = os.path.expanduser(constants.RCLONE_CONFIG_PATH) + with 
open(rclone_config_path, 'r', encoding='utf-8') as file: bucket_profile_found = False for line in file: if line.lstrip().startswith('#'): # skip user's comments. continue - if line.strip() == f'[{bucket_rclone_profile}]': + if line.strip() == f'[{rclone_profile}]': bucket_profile_found = True elif bucket_profile_found and line.startswith('region'): return line.split('=')[1].strip() @@ -535,36 +529,34 @@ def get_region_from_rclone(bucket_name: str, cloud: RcloneClouds) -> str: return '' @staticmethod - def delete_rclone_bucket_profile(bucket_name: str, cloud: RcloneClouds): + def delete_rclone_bucket_profile(bucket_name: str, cloud: RcloneStores): """Deletes specified bucket profile for rclone.conf""" - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, cloud) - rclone_config_path = Rclone._RCLONE_ABS_CONFIG_PATH + rclone_profile = cloud.get_profile_name(bucket_name) + rclone_config_path = os.path.expanduser(constants.RCLONE_CONFIG_PATH) if not os.path.isfile(rclone_config_path): - logger.warning( - 'Failed to locate "rclone.conf" while ' - f'trying to delete rclone profile: {bucket_rclone_profile}') + logger.warning('Failed to locate "rclone.conf" while ' + f'trying to delete rclone profile: {rclone_profile}') return with FileLock(rclone_config_path + '.lock'): profiles_to_keep = Rclone._remove_bucket_profile_rclone( bucket_name, cloud) - # write back file without profile: [bucket_rclone_profile] + # write back file without profile: [rclone_profile] with open(f'{rclone_config_path}', 'w', encoding='utf-8') as file: file.writelines(profiles_to_keep) @staticmethod def _remove_bucket_profile_rclone(bucket_name: str, - cloud: RcloneClouds) -> List[str]: + cloud: RcloneStores) -> List[str]: """Returns rclone profiles without profiles matching - [profile_prefix+bucket_name]""" - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, cloud) - rclone_config_path = Rclone._RCLONE_ABS_CONFIG_PATH + [profile_prefix+bucket_name] + """ + rclone_profile_name = cloud.get_profile_name(bucket_name) + rclone_config_path = os.path.expanduser(constants.RCLONE_CONFIG_PATH) - with open(f'{rclone_config_path}', 'r', encoding='utf-8') as file: + with open(rclone_config_path, 'r', encoding='utf-8') as file: lines = file.readlines() # returns a list of the file's lines # delete existing bucket profile matching: # '[profile_prefix+bucket_name]' @@ -577,7 +569,7 @@ def _remove_bucket_profile_rclone(bucket_name: str, # keep user comments only if they aren't under # a profile we are discarding lines_to_keep.append(line) - elif f'[{bucket_rclone_profile}]' in line: + elif f'[{rclone_profile_name}]' in line: skip_lines = True elif skip_lines: if '[' in line: diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 0fb810273d8..a75009d09db 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -5,6 +5,7 @@ from typing import Optional from sky import exceptions +from sky.skylet import constants from sky.utils import command_runner # Values used to construct mounting commands @@ -16,6 +17,12 @@ GCSFUSE_VERSION = '1.3.0' # https://github.com/rclone/rclone/releases RCLONE_VERSION = '1.67.0' +# Creates a fusermount3 soft link on older (<22) Ubuntu systems to utilize +# Rclone's mounting utility. +FUSERMOUNT3_SOFT_LINK_CMD = ('[ ! 
-f /bin/fusermount3 ] && ' + 'sudo ln -s /bin/fusermount /bin/fusermount3 || ' + 'true') + def get_s3_mount_install_cmd() -> str: """Returns a command to install S3 mount utility goofys.""" @@ -70,7 +77,6 @@ def get_r2_mount_cmd(r2_credentials_path: str, r2_profile_name: str, return mount_cmd - def get_rclone_install_cmd() -> str: """Returns a command to install Rclone.""" install_cmd = ('wget -nc https://github.com/rclone/rclone/releases' @@ -80,59 +86,48 @@ def get_rclone_install_cmd() -> str: return install_cmd -def get_cos_mount_cmd(rclone_config_data: str, rclone_config_path: str, - bucket_rclone_profile: str, bucket_name: str, - mount_path: str) -> str: +def get_cos_mount_cmd(rclone_config: str, rclone_profile_name: str, + bucket_name: str, mount_path: str) -> str: """Returns a command to mount an IBM COS bucket using rclone.""" - # creates a fusermount soft link on older (<22) Ubuntu systems for - # rclone's mount utility. - set_fuser3_soft_link = ('[ ! -f /bin/fusermount3 ] && ' - 'sudo ln -s /bin/fusermount /bin/fusermount3 || ' - 'true') # stores bucket profile in rclone config file at the cluster's nodes. - configure_rclone_profile = (f'{set_fuser3_soft_link}; ' - 'mkdir -p ~/.config/rclone/ && ' - f'echo "{rclone_config_data}" >> ' - f'{rclone_config_path}') + configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; ' + f'mkdir -p {constants.RCLONE_CONFIG_DIR} && ' + f'echo "{rclone_config}" >> ' + f'{constants.RCLONE_CONFIG_PATH}') # --daemon will keep the mounting process running in the background. mount_cmd = (f'{configure_rclone_profile} && ' 'rclone mount ' - f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' + f'{rclone_profile_name}:{bucket_name} {mount_path} ' '--daemon') return mount_cmd -def get_mount_cache_cmd(rclone_config_data: str, rclone_config_path: str, - bucket_rclone_profile: str, bucket_name: str, - mount_path: str) -> str: +def get_mount_cache_cmd(rclone_config: str, rclone_profile_name: str, + bucket_name: str, mount_path: str) -> str: """Returns a command to mount a GCP/AWS bucket using rclone.""" - # creates a fusermount soft link on older (<22) Ubuntu systems for - # rclone's mount utility. - set_fuser3_soft_link = ('[ ! -f /bin/fusermount3 ] && ' - 'sudo ln -s /bin/fusermount /bin/fusermount3 || ' - 'true') - # stores bucket profile in rclone config file at the cluster's nodes. - configure_rclone_profile = (f'{set_fuser3_soft_link}; ' - 'mkdir -p ~/.config/rclone/ && ' - f'echo "{rclone_config_data}" >> ' - f'{rclone_config_path}') + # stores bucket profile in rclone config file at the remote nodes. + configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; ' + f'mkdir -p {constants.RCLONE_CONFIG_DIR} && ' + f'echo "{rclone_config}" >> ' + f'{constants.RCLONE_CONFIG_PATH}') # --daemon will keep the mounting process running in the background. # TODO(Doyoung): remove rclone log related scripts and options when done with implementation. - log_dir_path = os.path.expanduser(f'~/.sky/rclone_log') + log_dir_path = os.path.expanduser('~/.sky/rclone_log') log_file_path = os.path.join(log_dir_path, f'{bucket_name}.log') create_log_cmd = f'mkdir -p {log_dir_path} && touch {log_file_path}' # when mounting multiple directories with vfs cache mode, it's handled by # rclone to create separate cache directories at ~/.cache/rclone/vfs. It is # not necessary to specify separate cache directories. 
- mount_cmd = (f'{create_log_cmd}; ' - f'{configure_rclone_profile} && ' - 'rclone mount ' - f'{bucket_rclone_profile}:{bucket_name} {mount_path} ' - '--daemon --daemon-wait 0 ' - # need to update the log fiel so it grabs the home directory from the remote instance. - #f'--log-file {log_file_path} --log-level DEBUG ' #log related flags - f'--allow-other --vfs-cache-mode full --dir-cache-time 30s ' - '--transfers 1 && --vfs-cache-poll-interval 5s') + mount_cmd = ( + #f'{create_log_cmd}; ' + f'{configure_rclone_profile} && ' + 'rclone mount ' + f'{rclone_profile_name}:{bucket_name} {mount_path} ' + '--daemon --daemon-wait 0 ' + # need to update the log fiel so it grabs the home directory from the remote instance. + #f'--log-file {log_file_path} --log-level DEBUG ' #log related flags + '--allow-other --vfs-cache-mode full --dir-cache-time 30s ' + '--transfers 1 --vfs-cache-poll-interval 5s') return mount_cmd diff --git a/sky/data/storage.py b/sky/data/storage.py index 629e93e31ec..1560e1a6a18 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -25,7 +25,6 @@ from sky.data import data_utils from sky.data import mounting_utils from sky.data import storage_utils -from sky.data.data_utils import Rclone from sky.utils import common_utils from sky.utils import rich_utils from sky.utils import schemas @@ -312,7 +311,19 @@ def _download_file(self, remote_path: str, local_path: str) -> None: def mount_command(self, mount_path: str) -> str: """Returns the command to mount the Store to the specified mount_path. - Includes the setup commands to install mounting tools. + This command is used for MOUNT mode. Includes the setup commands to + install mounting tools. + + Args: + mount_path: str; Mount path on remote server + """ + raise NotImplementedError + + def mount_cache_command(self, mount_path: str) -> str: + """Returns the command to mount the Store to the specified mount_path. + + This command is used for MOUNT_CACHE mode. Includes the setup commands + to install mounting tools. 
Args: mount_path: str; Mount path on remote server @@ -1037,9 +1048,6 @@ def __init__(self, self.bucket: 'StorageHandle' super().__init__(name, source, region, is_sky_managed, sync_on_reconstruction) - self.bucket_rclone_profile = \ - Rclone.generate_rclone_bucket_profile_name( - self.name, Rclone.RcloneClouds.AWS) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -1358,22 +1366,19 @@ def mount_command(self, mount_path: str) -> str: mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) - + def mount_cache_command(self, mount_path: str) -> str: - install_cmd = mounting_utils.get_mount_install_cmd_rclone() - rclone_config_data = Rclone.get_rclone_config( - self.bucket.name, - Rclone.RcloneClouds.AWS, - None - ) - mount_cmd = mounting_utils.get_mount_cache_cmd(rclone_config_data, - Rclone.RCLONE_CONFIG_PATH, - self.bucket_rclone_profile, - self.bucket.name, - mount_path) + install_cmd = mounting_utils.get_rclone_install_cmd() + rclone_profile_name = ( + data_utils.Rclone.RcloneStores.S3.get_profile_name(self.name)) + rclone_config = data_utils.Rclone.RcloneStores.S3.get_config( + rclone_profile_name=rclone_profile_name) + mount_cmd = mounting_utils.get_mount_cache_cmd(rclone_config, + rclone_profile_name, + self.bucket.name, + mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, - mount_cmd) - + mount_cmd) def _create_s3_bucket(self, bucket_name: str, @@ -1465,9 +1470,6 @@ def __init__(self, self.bucket: StorageHandle super().__init__(name, source, region, is_sky_managed, sync_on_reconstruction) - self.bucket_rclone_profile = \ - Rclone.generate_rclone_bucket_profile_name( - self.name, Rclone.RcloneClouds.GCP) def _validate(self): if self.source is not None: @@ -1817,23 +1819,19 @@ def mount_command(self, mount_path: str) -> str: f'gcsfuse --version | grep -q {mounting_utils.GCSFUSE_VERSION}') return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd, version_check_cmd) - - + def mount_cache_command(self, mount_path: str) -> str: - install_cmd = mounting_utils.get_mount_install_cmd_rclone() - rclone_config_data = Rclone.get_rclone_config( - self.bucket.name, - Rclone.RcloneClouds.GCP, - None - ) - mount_cmd = mounting_utils.get_mount_cache_cmd(rclone_config_data, - Rclone.RCLONE_CONFIG_PATH, - self.bucket_rclone_profile, - self.bucket.name, - mount_path) + install_cmd = mounting_utils.get_rclone_install_cmd() + rclone_profile_name = data_utils.Rclone.RcloneStores.GCS.get_profile_name( + self.name) + rclone_config = data_utils.Rclone.RcloneStores.GCS.get_config( + rclone_profile_name=rclone_profile_name) + mount_cmd = mounting_utils.get_mount_cache_cmd(rclone_config, + rclone_profile_name, + self.bucket.name, + mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, - mount_cmd) - + mount_cmd) def _download_file(self, remote_path: str, local_path: str) -> None: """Downloads file from remote to local on GS bucket @@ -2299,11 +2297,10 @@ def __init__(self, sync_on_reconstruction: bool = True): self.client: 'storage.Client' self.bucket: 'StorageHandle' + self.rclone_profile_name = ( + data_utils.Rclone.RcloneStores.IBM.get_profile_name(self.name)) super().__init__(name, source, region, is_sky_managed, sync_on_reconstruction) - self.bucket_rclone_profile = \ - Rclone.generate_rclone_bucket_profile_name( - self.name, Rclone.RcloneClouds.IBM) def _validate(self): if self.source is not None and isinstance(self.source, str): @@ -2482,7 +2479,7 @@ def 
get_dir_sync_command(src_dir_path, dest_dir_name) -> str: sync_command = ( 'rclone copy --exclude ".git/*" ' f'{src_dir_path} ' - f'{self.bucket_rclone_profile}:{self.name}/{dest_dir_name}') + f'{self.rclone_profile_name}:{self.name}/{dest_dir_name}') return sync_command def get_file_sync_command(base_dir_path, file_names) -> str: @@ -2510,7 +2507,7 @@ def get_file_sync_command(base_dir_path, file_names) -> str: base_dir_path = shlex.quote(base_dir_path) sync_command = ('rclone copy ' f'{includes} {base_dir_path} ' - f'{self.bucket_rclone_profile}:{self.name}') + f'{self.rclone_profile_name}:{self.name}') return sync_command # Generate message for upload @@ -2549,7 +2546,7 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: 'sky storage delete' or 'sky start' """ - bucket_profile_name = Rclone.RcloneClouds.IBM.value + self.name + bucket_profile_name = data_utils.Rclone.RcloneStores.IBM.value + self.name try: bucket_region = data_utils.get_ibm_cos_bucket_region(self.name) except exceptions.StorageBucketGetError as e: @@ -2584,9 +2581,9 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]: '`rclone lsd ` on relevant remotes returned ' 'via `rclone listremotes` to debug.') - Rclone.store_rclone_config( + data_utils.Rclone.store_rclone_config( self.name, - Rclone.RcloneClouds.IBM, + data_utils.Rclone.RcloneStores.IBM, self.region, # type: ignore ) if not bucket_region and self.sync_on_reconstruction: @@ -2625,15 +2622,12 @@ def mount_command(self, mount_path: str) -> str: mount_path: str; Path to mount the bucket to. """ # install rclone if not installed. - install_cmd = mounting_utils.get_mount_install_cmd_rclone() - rclone_config_data = Rclone.get_rclone_config( - self.bucket.name, - Rclone.RcloneClouds.IBM, - self.region, # type: ignore - ) - mount_cmd = mounting_utils.get_cos_mount_cmd(rclone_config_data, - Rclone.RCLONE_CONFIG_PATH, - self.bucket_rclone_profile, + install_cmd = mounting_utils.get_rclone_install_cmd() + rclone_config = data_utils.Rclone.RcloneStores.IBM.get_config( + rclone_profile_name=self.rclone_profile_name, + region=self.region) # type: ignore + mount_cmd = mounting_utils.get_cos_mount_cmd(rclone_config, + self.rclone_profile_name, self.bucket.name, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, @@ -2685,4 +2679,5 @@ def _delete_cos_bucket(self): except ibm.ibm_botocore.exceptions.ClientError as e: if e.__class__.__name__ == 'NoSuchBucket': logger.debug('bucket already removed') - Rclone.delete_rclone_bucket_profile(self.name, Rclone.RcloneClouds.IBM) + data_utils.Rclone.delete_rclone_bucket_profile( + self.name, data_utils.Rclone.RcloneStores.IBM) diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 0c68fd7f6e6..50c6323b452 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -217,3 +217,6 @@ # Serve: A default controller with 4 vCPU and 16 GB memory can run up to 16 # services. 
CONTROLLER_PROCESS_CPU_DEMAND = 0.25 + +RCLONE_CONFIG_DIR = '~/.config/rclone' +RCLONE_CONFIG_PATH = f'{RCLONE_CONFIG_DIR}/rclone.conf' \ No newline at end of file diff --git a/sky/task.py b/sky/task.py index a94d871715f..c196b95ea73 100644 --- a/sky/task.py +++ b/sky/task.py @@ -988,7 +988,7 @@ def sync_storage_mounts(self) -> None: assert storage.name is not None, storage # extract region from rclone.conf cos_region = data_utils.Rclone.get_region_from_rclone( - storage.name, data_utils.Rclone.RcloneClouds.IBM) + storage.name, data_utils.Rclone.RcloneStores.IBM) blob_path = f'cos://{cos_region}/{storage.name}' self.update_file_mounts({mnt_path: blob_path}) elif store_type is storage_lib.StoreType.AZURE: @@ -1113,7 +1113,7 @@ def get_required_cloud_features( # Storage mounting for _, storage_mount in self.storage_mounts.items(): if (storage_mount.mode == storage_lib.StorageMode.MOUNT or - storage_mount.mode == storage_lib.StorageMode.RCLONE): + storage_mount.mode == storage_lib.StorageMode.MOUNT_CACHE): required_features.add( clouds.CloudImplementationFeatures.STORAGE_MOUNTING) break diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 15c16090c26..b6038235699 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -54,7 +54,6 @@ from sky.clouds import GCP from sky.data import data_utils from sky.data import storage as storage_lib -from sky.data.data_utils import Rclone from sky.skylet import events from sky.utils import common_utils from sky.utils import resources_utils @@ -1003,14 +1002,14 @@ def test_aws_storage_mounts_with_stop(): def test_aws_mount_rclone(): name = _get_cluster_name() storage_name = f'sky-test-{int(time.time())}' - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - storage_name, Rclone.RcloneClouds.AWS) + rclone_profile_name = data_utils.Rclone.RcloneStores.S3.get_profile_name( + storage_name) template_str = pathlib.Path( 'tests/test_yamls/test_rclone_mount.yaml').read_text() template = jinja2.Template(template_str) content = template.render(store_type=f'{storage_lib.StoreType.S3.value}', storage_name=storage_name, - bucket_rclone_profile=bucket_rclone_profile) + rclone_profile_name=rclone_profile_name) with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: f.write(content) f.flush() @@ -1065,14 +1064,14 @@ def test_gcp_storage_mounts_with_stop(): def test_gcp_mount_rclone(): name = _get_cluster_name() storage_name = f'sky-test-{int(time.time())}' - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - storage_name, Rclone.RcloneClouds.GCP) + rclone_profile_name = data_utils.Rclone.RcloneStores.GCS.get_profile_name( + storage_name) template_str = pathlib.Path( 'tests/test_yamls/test_rclone_mount.yaml').read_text() template = jinja2.Template(template_str) content = template.render(store_type=storage_lib.StoreType.GCS.value, storage_name=storage_name, - bucket_rclone_profile=bucket_rclone_profile) + rclone_profile_name=rclone_profile_name) with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: f.write(content) f.flush() @@ -1195,8 +1194,8 @@ def test_cloudflare_storage_mounts(generic_cloud: str): def test_ibm_storage_mounts(): name = _get_cluster_name() storage_name = f'sky-test-{int(time.time())}' - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - storage_name, Rclone.RcloneClouds.IBM) + rclone_profile_name = data_utils.Rclone.RcloneStores.IBM.get_profile_name( + storage_name) template_str = pathlib.Path( 'tests/test_yamls/test_ibm_cos_storage_mounting.yaml').read_text() template 
= jinja2.Template(template_str) @@ -1209,7 +1208,7 @@ def test_ibm_storage_mounts(): *storage_setup_commands, f'sky launch -y -c {name} --cloud ibm {file_path}', f'sky logs {name} 1 --status', # Ensure job succeeded. - f'rclone ls {bucket_rclone_profile}:{storage_name}/hello.txt', + f'rclone ls {rclone_profile_name}:{storage_name}/hello.txt', ] test = Test( 'ibm_storage_mounts', @@ -4103,9 +4102,9 @@ def cli_delete_cmd(store_type, bucket_name): url = f's3://{bucket_name}' return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 rb {url} --force --endpoint {endpoint_url} --profile=r2' if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone purge {bucket_rclone_profile}:{bucket_name} && rclone config delete {bucket_rclone_profile}' + rclone_profile_name = (data_utils.Rclone.RcloneStores.IBM. + get_profile_name(bucket_name)) + return f'rclone purge {rclone_profile_name}:{bucket_name} && rclone config delete {rclone_profile_name}' @staticmethod def cli_ls_cmd(store_type, bucket_name, suffix=''): @@ -4129,9 +4128,9 @@ def cli_ls_cmd(store_type, bucket_name, suffix=''): url = f's3://{bucket_name}' return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2' if store_type == storage_lib.StoreType.IBM: - bucket_rclone_profile = Rclone.generate_rclone_bucket_profile_name( - bucket_name, Rclone.RcloneClouds.IBM) - return f'rclone ls {bucket_rclone_profile}:{bucket_name}/{suffix}' + rclone_profile_name = (data_utils.Rclone.RcloneStores.IBM. + get_profile_name(bucket_name)) + return f'rclone ls {rclone_profile_name}:{bucket_name}/{suffix}' @staticmethod def cli_region_cmd(store_type, bucket_name): diff --git a/tests/test_yamls/test_rclone_mount.yaml b/tests/test_yamls/test_rclone_mount.yaml index dd9edca6f06..f92ff4668eb 100644 --- a/tests/test_yamls/test_rclone_mount.yaml +++ b/tests/test_yamls/test_rclone_mount.yaml @@ -20,5 +20,5 @@ run: | echo "hello" > /mount_private_rclone/hello.txt # Ensure that write is reflected in bucket - rclone ls {{ bucket_rclone_profile }}:{{ storage_name }}/hello.txt + rclone ls {{ rclone_profile_name }}:{{ storage_name }}/hello.txt From 0976fa1b2fbd2a23391caa3cf4b491a15da1badf Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 6 Jul 2024 18:28:45 +0000 Subject: [PATCH 36/42] nit --- sky/cloud_stores.py | 6 +++--- sky/data/data_utils.py | 42 ++++++++++++++++++++++++++++++------------ 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/sky/cloud_stores.py b/sky/cloud_stores.py index a5273feb27d..5ed7053dcb1 100644 --- a/sky/cloud_stores.py +++ b/sky/cloud_stores.py @@ -15,7 +15,7 @@ from sky.adaptors import ibm from sky.clouds import gcp from sky.data import data_utils -from sky.data import mounting_utils +from sky.skylet import constants class CloudStorage: @@ -269,8 +269,8 @@ def _get_rclone_sync_command(self, source: str, destination: str): rclone_profile_name=rclone_profile_name, region=bucket_region) # configure_rclone stores bucket profile in remote cluster's rclone.conf configure_rclone = ( - f' mkdir -p ~/.config/rclone/ &&' - f' echo "{rclone_config}">> {mounting_utils.RCLONE_CONFIG_PATH}') + f' mkdir -p {constants.RCLONE_CONFIG_DIR} &&' + f' echo "{rclone_config}">> {constants.RCLONE_CONFIG_PATH}') download_via_rclone = ( 'rclone copy ' f'{rclone_profile_name}:{data_path_in_bucket} {destination}') diff --git a/sky/data/data_utils.py 
b/sky/data/data_utils.py index 7cb395a52f2..376b4fd1bea 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -405,24 +405,42 @@ class RcloneStores(enum.Enum): IBM = 'IBM' def get_profile_name(self, bucket_name: str) -> str: + """Gets the Rclone profile name for a given bucket. + + Args: + bucket_name: The name of the bucket. + + Returns: + A string containing the Rclone profile name, which combines + prefix based on the storage type and the bucket name. """ - """ - if self is Rclone.RcloneStores.S3: - return f'sky-s3-{bucket_name}' - elif self is Rclone.RcloneStores.GCS: - return f'sky-gcs-{bucket_name}' - elif self is Rclone.RcloneStores.IBM: - return f'sky-ibm-{bucket_name}' - else: - with ux_utils.print_exception_no_traceback(): - raise NotImplementedError( - f'Unsupported store type for Rclone: {self}') + profile_prefix = { + Rclone.RcloneStores.S3: 'sky-s3', + Rclone.RcloneStores.GCS: 'sky-gcs', + Rclone.RcloneStores.IBM: 'sky-ibm', + } + return f'{profile_prefix[self]}-{bucket_name}' def get_config(self, bucket_name: Optional[str] = None, rclone_profile_name: Optional[str] = None, region: Optional[str] = None) -> str: - """ + """Generates an Rclone configuration for a specific storage type. + + This method creates an Rclone configuration string based on the storage + type and the provided parameters. + + Args: + bucket_name: The name of the bucket. + rclone_profile_name: The name of the Rclone profile. If not + provided, it will be generated using the bucket_name. + region: Region of bucket. + + Returns: + A string containing the Rclone configuration. + + Raises: + NotImplementedError: If the storage type is not supported. """ if rclone_profile_name is None: assert bucket_name is not None From 3d162557bdfe3240151198b7fa11596a010e45db Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sat, 6 Jul 2024 23:02:19 +0000 Subject: [PATCH 37/42] update MOUNT_CACHE mode to MOUNT_CACHED mode --- sky/backends/cloud_vm_ray_backend.py | 10 +++++----- sky/data/mounting_utils.py | 4 ++-- sky/data/storage.py | 26 +++++++++++--------------- sky/task.py | 2 +- 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 3f79dd91caa..dcf20796568 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4559,11 +4559,11 @@ def _execute_storage_mounts( path: storage_mount for path, storage_mount in storage_mounts.items() if (storage_mount.mode == storage_lib.StorageMode.MOUNT or - storage_mount.mode == storage_lib.StorageMode.MOUNT_CACHE) + storage_mount.mode == storage_lib.StorageMode.MOUNT_CACHED) } # Handle cases when there aren't any Storages with either MOUNT or - # MOUNT_CACHE mode. + # MOUNT_CACHED mode. 
if not storage_mounts: return @@ -4594,9 +4594,9 @@ def _execute_storage_mounts( mount_cmd = store.mount_command(dst) action_message = 'Mounting' else: - assert storage_obj.mode == storage_lib.StorageMode.MOUNT_CACHE - mount_cmd = store.mount_cache_command(dst) - action_message = 'Mounting cache mode' + assert storage_obj.mode == storage_lib.StorageMode.MOUNT_CACHED + mount_cmd = store.mount_cached_command(dst) + action_message = 'Mounting cached mode' src_print = (storage_obj.source if storage_obj.source else storage_obj.name) if isinstance(src_print, list): diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index a75009d09db..50e0641be7b 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -102,8 +102,8 @@ def get_cos_mount_cmd(rclone_config: str, rclone_profile_name: str, return mount_cmd -def get_mount_cache_cmd(rclone_config: str, rclone_profile_name: str, - bucket_name: str, mount_path: str) -> str: +def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str, + bucket_name: str, mount_path: str) -> str: """Returns a command to mount a GCP/AWS bucket using rclone.""" # stores bucket profile in rclone config file at the remote nodes. configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; ' diff --git a/sky/data/storage.py b/sky/data/storage.py index 1560e1a6a18..2388a3ed9ea 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -164,7 +164,7 @@ def store_prefix(self) -> str: class StorageMode(enum.Enum): MOUNT = 'MOUNT' COPY = 'COPY' - MOUNT_CACHE = 'MOUNT_CACHE' + MOUNT_CACHED = 'MOUNT_CACHED' class AbstractStore: @@ -319,10 +319,10 @@ def mount_command(self, mount_path: str) -> str: """ raise NotImplementedError - def mount_cache_command(self, mount_path: str) -> str: + def mount_cached_command(self, mount_path: str) -> str: """Returns the command to mount the Store to the specified mount_path. - This command is used for MOUNT_CACHE mode. Includes the setup commands + This command is used for MOUNT_CACHED mode. Includes the setup commands to install mounting tools. 
Args: @@ -1367,18 +1367,16 @@ def mount_command(self, mount_path: str) -> str: return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd) - def mount_cache_command(self, mount_path: str) -> str: + def mount_cached_command(self, mount_path: str) -> str: install_cmd = mounting_utils.get_rclone_install_cmd() rclone_profile_name = ( data_utils.Rclone.RcloneStores.S3.get_profile_name(self.name)) rclone_config = data_utils.Rclone.RcloneStores.S3.get_config( rclone_profile_name=rclone_profile_name) - mount_cmd = mounting_utils.get_mount_cache_cmd(rclone_config, - rclone_profile_name, - self.bucket.name, - mount_path) + mount_cached_cmd = mounting_utils.get_mount_cached_cmd( + rclone_config, rclone_profile_name, self.bucket.name, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, - mount_cmd) + mount_cached_cmd) def _create_s3_bucket(self, bucket_name: str, @@ -1820,18 +1818,16 @@ def mount_command(self, mount_path: str) -> str: return mounting_utils.get_mounting_command(mount_path, install_cmd, mount_cmd, version_check_cmd) - def mount_cache_command(self, mount_path: str) -> str: + def mount_cached_command(self, mount_path: str) -> str: install_cmd = mounting_utils.get_rclone_install_cmd() rclone_profile_name = data_utils.Rclone.RcloneStores.GCS.get_profile_name( self.name) rclone_config = data_utils.Rclone.RcloneStores.GCS.get_config( rclone_profile_name=rclone_profile_name) - mount_cmd = mounting_utils.get_mount_cache_cmd(rclone_config, - rclone_profile_name, - self.bucket.name, - mount_path) + mount_cached_cmd = mounting_utils.get_mount_cached_cmd( + rclone_config, rclone_profile_name, self.bucket.name, mount_path) return mounting_utils.get_mounting_command(mount_path, install_cmd, - mount_cmd) + mount_cached_cmd) def _download_file(self, remote_path: str, local_path: str) -> None: """Downloads file from remote to local on GS bucket diff --git a/sky/task.py b/sky/task.py index c196b95ea73..fd28e9ff9af 100644 --- a/sky/task.py +++ b/sky/task.py @@ -1113,7 +1113,7 @@ def get_required_cloud_features( # Storage mounting for _, storage_mount in self.storage_mounts.items(): if (storage_mount.mode == storage_lib.StorageMode.MOUNT or - storage_mount.mode == storage_lib.StorageMode.MOUNT_CACHE): + storage_mount.mode == storage_lib.StorageMode.MOUNT_CACHED): required_features.add( clouds.CloudImplementationFeatures.STORAGE_MOUNTING) break From 3818a05d407126d00857eed0f14a3b05817c6aae Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 7 Jul 2024 00:30:06 +0000 Subject: [PATCH 38/42] update smoke test --- tests/test_smoke.py | 98 ++++++------------- .../test_yamls/test_storage_mounting.yaml.j2 | 23 +++++ 2 files changed, 53 insertions(+), 68 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b6038235699..46e54969888 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -970,14 +970,23 @@ def test_using_file_mounts_with_env_vars(generic_cloud: str): def test_aws_storage_mounts_with_stop(): name = _get_cluster_name() storage_name = f'sky-test-{int(time.time())}' + rclone_profile_name = data_utils.Rclone.RcloneStores.S3.get_profile_name( + storage_name) + + # Reading and rendering the template template_str = pathlib.Path( 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) + content = template.render(storage_name=storage_name, + include_mount_cached=True) + + # Creating a temporary YAML file with 
tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: f.write(content) f.flush() file_path = f.name + + # List of test commands test_commands = [ *storage_setup_commands, f'sky launch -y -c {name} --cloud aws {file_path}', @@ -985,10 +994,12 @@ def test_aws_storage_mounts_with_stop(): f'aws s3 ls {storage_name}/hello.txt', f'sky stop -y {name}', f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' + # Check if hello.txt exists after restart at the mount point + f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt; ' + f'rclone ls {rclone_profile_name}:{storage_name}/hello.txt;"', ] + + # Creating and running the test test = Test( 'aws_storage_mounts', test_commands, @@ -998,48 +1009,27 @@ def test_aws_storage_mounts_with_stop(): run_one_test(test) -@pytest.mark.aws -def test_aws_mount_rclone(): - name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - rclone_profile_name = data_utils.Rclone.RcloneStores.S3.get_profile_name( - storage_name) - template_str = pathlib.Path( - 'tests/test_yamls/test_rclone_mount.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(store_type=f'{storage_lib.StoreType.S3.value}', - storage_name=storage_name, - rclone_profile_name=rclone_profile_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *storage_setup_commands, - f'sky launch -y -c {name} --cloud aws {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - ] - test = Test( - 'aws_mount_rclone', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - @pytest.mark.gcp def test_gcp_storage_mounts_with_stop(): name = _get_cluster_name() storage_name = f'sky-test-{int(time.time())}' + rclone_profile_name = data_utils.Rclone.RcloneStores.GCS.get_profile_name( + storage_name) + + # Reading and rendering the template template_str = pathlib.Path( 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) + content = template.render(storage_name=storage_name, + include_mount_cached=True) + + # Creating a temporary YAML file with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: f.write(content) f.flush() file_path = f.name + + # List of test commands test_commands = [ *storage_setup_commands, f'sky launch -y -c {name} --cloud gcp {file_path}', @@ -1047,10 +1037,12 @@ def test_gcp_storage_mounts_with_stop(): f'gsutil ls gs://{storage_name}/hello.txt', f'sky stop -y {name}', f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' + # Check if hello.txt exists after restart at the mount point + f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt; ' + f'rclone ls {rclone_profile_name}:{storage_name}/hello.txt;"', ] + + # Creating and running the test test = Test( 'gcp_storage_mounts', test_commands, @@ -1060,36 +1052,6 @@ def test_gcp_storage_mounts_with_stop(): run_one_test(test) -@pytest.mark.gcp -def test_gcp_mount_rclone(): - name = _get_cluster_name() - storage_name = f'sky-test-{int(time.time())}' - rclone_profile_name = 
data_utils.Rclone.RcloneStores.GCS.get_profile_name( - storage_name) - template_str = pathlib.Path( - 'tests/test_yamls/test_rclone_mount.yaml').read_text() - template = jinja2.Template(template_str) - content = template.render(store_type=storage_lib.StoreType.GCS.value, - storage_name=storage_name, - rclone_profile_name=rclone_profile_name) - with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *storage_setup_commands, - f'sky launch -y -c {name} --cloud gcp {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - ] - test = Test( - 'gcp_mount_rclone', - test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', - timeout=20 * 60, # 20 mins - ) - run_one_test(test) - - @pytest.mark.kubernetes def test_kubernetes_storage_mounts(): # Tests bucket mounting on k8s, assuming S3 is configured. diff --git a/tests/test_yamls/test_storage_mounting.yaml.j2 b/tests/test_yamls/test_storage_mounting.yaml.j2 index 37a46829bd6..8dac453fe43 100644 --- a/tests/test_yamls/test_storage_mounting.yaml.j2 +++ b/tests/test_yamls/test_storage_mounting.yaml.j2 @@ -27,6 +27,14 @@ file_mounts: source: ~/tmp-workdir mode: MOUNT + # Mounting private buckets in MOUNT_CACHED mode + {% if include_mount_cached | default(False) %} + /mount_private_mount_cached: + name: {{storage_name}}_mount_cached + source: ~/tmp-workdir + mode: MOUNT_CACHED + {% endif %} + run: | set -ex @@ -48,3 +56,18 @@ run: | # Write to private bucket in MOUNT mode should pass echo "hello" > /mount_private_mount/hello.txt + + # Write to private bucket in MOUNT_CACHED mode should pass + echo "hello" > /mount_private_mount_cached/hello.txt + + {% if include_mount_cached | default(False) %} + # Check private bucket contents + ls -ltr /mount_private_mount_cached/foo + ls -ltr /mount_private_mount_cached/tmp\ file + + # Symlinks are not copied to buckets + ! ls /mount_private_mount_cached/circle-link + + # Write to private bucket in MOUNT_CACHED mode should pass + echo "hello" > /mount_private_mount_cached/hello.txt + {% endif %} From 27e702bdd8a5c8ef82a2b6e80183cd4804d1790f Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Sun, 7 Jul 2024 02:45:16 +0000 Subject: [PATCH 39/42] additional comments for mount cached command explaining options for rclone vfs mode --- sky/data/mounting_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index 26806ff991a..c3c416f6fe8 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -110,7 +110,6 @@ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str, f'mkdir -p {constants.RCLONE_CONFIG_DIR} && ' f'echo "{rclone_config}" >> ' f'{constants.RCLONE_CONFIG_PATH}') - # --daemon will keep the mounting process running in the background. # TODO(Doyoung): remove rclone log related scripts and options when done with implementation. log_dir_path = os.path.expanduser('~/.sky/rclone_log') log_file_path = os.path.join(log_dir_path, f'{bucket_name}.log') @@ -123,10 +122,17 @@ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str, f'{configure_rclone_profile} && ' 'rclone mount ' f'{rclone_profile_name}:{bucket_name} {mount_path} ' + # '--daemon' keeps the mounting process running in the background. '--daemon --daemon-wait 0 ' # need to update the log fiel so it grabs the home directory from the remote instance. 
#f'--log-file {log_file_path} --log-level DEBUG ' #log related flags + # '--dir-cache-time' specifies the frequency of how often rclone should + # check the backend storage for an update when there is a discrepancy. '--allow-other --vfs-cache-mode full --dir-cache-time 30s ' + # '--transfers 1' guarantees the files written at the local mount point + # to be uploaded to the backend storage in the order of creation. + # '--vfs-cache-poll-interval' specifies the frequency of how often + # rclone checks the local mount point to upload newly written files. '--transfers 1 --vfs-cache-poll-interval 5s') return mount_cmd From 061253242927d24db8618236342099314b06fe18 Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Mon, 8 Jul 2024 01:51:59 +0000 Subject: [PATCH 40/42] rclone class doc-string fix --- sky/data/data_utils.py | 52 ++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index 376b4fd1bea..4ef24792b67 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -390,15 +390,8 @@ def get_cos_regions() -> List[str]: class Rclone: - """Static class implementing common utilities of rclone without rclone sdk. + """Provides methods to manage and generate Rclone configuration profile.""" - Storage providers supported by rclone are required to: - - list their rclone profile prefix in RcloneStores - - implement configuration in get_config() - """ - - # Mapping of storage providers using rclone - # to their respective profile prefix class RcloneStores(enum.Enum): S3 = 'S3' GCS = 'GCS' @@ -487,8 +480,19 @@ def get_config(self, @staticmethod def store_rclone_config(bucket_name: str, cloud: RcloneStores, region: str) -> str: - """Creates a configuration files for rclone - used for - bucket syncing and mounting """ + """Creates rclone configuration files for bucket syncing and mounting. + + Args: + bucket_name: Name of the bucket. + cloud: RcloneStores enum representing the cloud provider. + region: Region of the bucket. + + Returns: + str: The configuration data written to the file. + + Raises: + StorageError: If rclone is not installed. + """ rclone_config_path = os.path.expanduser(constants.RCLONE_CONFIG_PATH) config_data = cloud.get_config(bucket_name=bucket_name, region=region) try: @@ -526,8 +530,15 @@ def store_rclone_config(bucket_name: str, cloud: RcloneStores, @staticmethod def get_region_from_rclone(bucket_name: str, cloud: RcloneStores) -> str: - """Returns region field of the specified bucket in rclone.conf - if bucket exists, else empty string""" + """Returns the region field of the specified bucket in rclone.conf. + + Args: + bucket_name: Name of the bucket. + cloud: RcloneStores enum representing the cloud provider. + + Returns: + The region field if the bucket exists, otherwise an empty string. + """ rclone_profile = cloud.get_profile_name(bucket_name) rclone_config_path = os.path.expanduser(constants.RCLONE_CONFIG_PATH) with open(rclone_config_path, 'r', encoding='utf-8') as file: @@ -548,7 +559,12 @@ def get_region_from_rclone(bucket_name: str, cloud: RcloneStores) -> str: @staticmethod def delete_rclone_bucket_profile(bucket_name: str, cloud: RcloneStores): - """Deletes specified bucket profile for rclone.conf""" + """Deletes specified bucket profile from rclone.conf. + + Args: + bucket_name: Name of the bucket. + cloud: RcloneStores enum representing the cloud provider. 
+ """ rclone_profile = cloud.get_profile_name(bucket_name) rclone_config_path = os.path.expanduser(constants.RCLONE_CONFIG_PATH) @@ -568,8 +584,14 @@ def delete_rclone_bucket_profile(bucket_name: str, cloud: RcloneStores): @staticmethod def _remove_bucket_profile_rclone(bucket_name: str, cloud: RcloneStores) -> List[str]: - """Returns rclone profiles without profiles matching - [profile_prefix+bucket_name] + """Returns rclone profiles without profiles matching [profile_prefix+bucket_name]. + + Args: + bucket_name: Name of the bucket. + cloud: RcloneStores enum representing the cloud provider. + + Returns: + Lines to keep in the rclone config file. """ rclone_profile_name = cloud.get_profile_name(bucket_name) rclone_config_path = os.path.expanduser(constants.RCLONE_CONFIG_PATH) From ea290b85fac104a7ad2647f7b7fe10ff11b6acbb Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Mon, 8 Jul 2024 01:52:22 +0000 Subject: [PATCH 41/42] nit format --- sky/data/mounting_utils.py | 10 +++++++--- tests/test_smoke.py | 14 +++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py index c3c416f6fe8..0d2c3f0cc1e 100644 --- a/sky/data/mounting_utils.py +++ b/sky/data/mounting_utils.py @@ -104,7 +104,11 @@ def get_cos_mount_cmd(rclone_config: str, rclone_profile_name: str, def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str, bucket_name: str, mount_path: str) -> str: - """Returns a command to mount a GCP/AWS bucket using rclone.""" + """Returns a command to mount a bucket using rclone with vfs cache. + + + + """ # stores bucket profile in rclone config file at the remote nodes. configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; ' f'mkdir -p {constants.RCLONE_CONFIG_DIR} && ' @@ -124,11 +128,11 @@ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str, f'{rclone_profile_name}:{bucket_name} {mount_path} ' # '--daemon' keeps the mounting process running in the background. '--daemon --daemon-wait 0 ' - # need to update the log fiel so it grabs the home directory from the remote instance. + # need to update the log file so it grabs the home directory from the remote instance. #f'--log-file {log_file_path} --log-level DEBUG ' #log related flags # '--dir-cache-time' specifies the frequency of how often rclone should # check the backend storage for an update when there is a discrepancy. - '--allow-other --vfs-cache-mode full --dir-cache-time 30s ' + '--allow-other --vfs-cache-mode writes --dir-cache-time 30s ' # '--transfers 1' guarantees the files written at the local mount point # to be uploaded to the backend storage in the order of creation. 
# '--vfs-cache-poll-interval' specifies the frequency of how often diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8370cad3a68..23aa9718d4f 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1020,13 +1020,13 @@ def test_aws_storage_mounts_with_stop(): template = jinja2.Template(template_str) content = template.render(storage_name=storage_name, include_mount_cached=True) - + # Creating a temporary YAML file with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: f.write(content) f.flush() file_path = f.name - + # List of test commands test_commands = [ *storage_setup_commands, @@ -1039,7 +1039,7 @@ def test_aws_storage_mounts_with_stop(): f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt; ' f'rclone ls {rclone_profile_name}:{storage_name}/hello.txt;"', ] - + # Creating and running the test test = Test( 'aws_storage_mounts', @@ -1056,20 +1056,20 @@ def test_gcp_storage_mounts_with_stop(): storage_name = f'sky-test-{int(time.time())}' rclone_profile_name = data_utils.Rclone.RcloneStores.GCS.get_profile_name( storage_name) - + # Reading and rendering the template template_str = pathlib.Path( 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() template = jinja2.Template(template_str) content = template.render(storage_name=storage_name, include_mount_cached=True) - + # Creating a temporary YAML file with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: f.write(content) f.flush() file_path = f.name - + # List of test commands test_commands = [ *storage_setup_commands, @@ -1082,7 +1082,7 @@ def test_gcp_storage_mounts_with_stop(): f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt; ' f'rclone ls {rclone_profile_name}:{storage_name}/hello.txt;"', ] - + # Creating and running the test test = Test( 'gcp_storage_mounts', From 64d9a7d9d4a722f38a87f6eef1b86d60d3f2ed6c Mon Sep 17 00:00:00 2001 From: Doyoung Kim Date: Mon, 8 Jul 2024 01:52:48 +0000 Subject: [PATCH 42/42] update step 7 of maybe_translate_local_file_mounts_and_sync_up --- sky/utils/controller_utils.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 477ebe8d1ba..0af4f6c0432 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -749,19 +749,20 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task', # it was handled in step 6. updated_mount_storages = {} for storage_path, storage_obj in task.storage_mounts.items(): - if (storage_obj.mode == storage_lib.StorageMode.MOUNT and - not storage_obj.source): - # Construct source URL with first store type and storage name - # E.g., s3://my-storage-name - source = list( - storage_obj.stores.keys())[0].store_prefix() + storage_obj.name - new_storage = storage_lib.Storage.from_yaml_config({ - 'source': source, - 'persistent': storage_obj.persistent, - 'mode': storage_lib.StorageMode.MOUNT.value, - # We enable force delete to allow the controller to delete - # the object store in case persistent is set to False. 
- '_force_delete': True - }) - updated_mount_storages[storage_path] = new_storage + if (storage_obj.mode == storage_lib.StorageMode.MOUNT or + storage_obj.mode == storage_lib.StorageMode.MOUNT_CACHED): + if storage_obj.source is None: + # Construct source URL with first store type and storage name + # E.g., s3://my-storage-name + source = list( + storage_obj.stores.keys())[0].store_prefix() + storage_obj.name + new_storage = storage_lib.Storage.from_yaml_config({ + 'source': source, + 'persistent': storage_obj.persistent, + 'mode': storage_obj.mode.value, + # We enable force delete to allow the controller to delete + # the object store in case persistent is set to False. + '_force_delete': True + }) + updated_mount_storages[storage_path] = new_storage task.update_storage_mounts(updated_mount_storages)
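
The MOUNT_CACHED mode added by this series is exercised end to end by the test template above. As a rough usage sketch, not taken from any file in these patches (the bucket name, mount path, and source directory below are hypothetical), a task spec would request the new mode like this:

# Hypothetical task spec illustrating the new MOUNT_CACHED mode.
# Writes land in rclone's local VFS cache first and are then uploaded
# to the backing bucket by the rclone mount daemon.
file_mounts:
  /mount_cached_example:
    name: my-sky-test-bucket   # hypothetical bucket name
    source: ~/tmp-workdir      # local directory whose contents seed the bucket
    mode: MOUNT_CACHED

run: |
  echo "hello" > /mount_cached_example/hello.txt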