diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index e6a213283d8..bc9893fec5b 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -1,7 +1,7 @@ # Run GPT-2 in llm.c on any cloud with SkyPilot -This is a reproducible package of llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481) -With SkyPilot, you can run GPT-2 (124M) training on any cloud. +This is a reproducible package of llm.c's GPT-2 (124M) training by @karpathy (https://github.com/karpathy/llm.c/discussions/481). +With SkyPilot, you can run GPT-2 (124M) training on any cloud. SkyPilot looks for the cheapest resources available on the clouds enabled for a user, launches and manages the whole data processing and training pipeline, leading to a close to ~\$20 target cost as @karpathy mentioned in the discussion. ## Prerequisites @@ -30,13 +30,21 @@ sky launch -c gpt2 gpt2.yaml ![GPT-2 training with 8 A100 GPUs](https://imgur.com/v8SGpsF.png) -Or, you can train the model with a single A100, by adding `--gpu A100`: +Or, you can train the model with a single A100, by adding `--gpus A100`: ```bash -sky launch -c gpt2 gpt2.yaml --gpu A100 +sky launch -c gpt2 gpt2.yaml --gpus A100 ``` ![GPT-2 training with a single A100](https://imgur.com/hN65g4r.png) + +It is also possible to speed up the training of the model on 8 H100 (2.3x more tok/s than 8x A100s): +```bash +sky launch -c gpt2 gpt2.yaml --gpus H100:8 +``` + +![GPT-2 training with 8 H100](https://imgur.com/STbi80b.png) + ### Download logs and visualizations After the training is finished, you can download the logs and visualizations with the following command: @@ -85,9 +93,9 @@ After the data is processed, you can then train the model on a GPU VM with 8 A10 sky launch -c gpt2-train --detach-setup gpt2-train.yaml --env BUCKET_NAME=your-bucket-name ``` -Or, you can train the model with a single A100, by adding `--gpu A100`: +Or, you can train the model with a single A100, by adding `--gpus A100`: ```bash 
-sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu A100 --env BUCKET_NAME=your-bucket-name +sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpus A100 --env BUCKET_NAME=your-bucket-name ``` @@ -95,12 +103,14 @@ sky launch -c gpt2-train --detach-setup gpt2-train.yaml --gpu A100 --env BUCKET_ We can also combine the two steps into a single SkyPilot job, and let SkyPilot to handle the dependencies between the two steps. Here is an example of how to do this (replace `your-bucket-name` with your bucket name): ```bash -cat gpt2-data.yaml > gpt2-pipeline.yaml -echo "---" >> gpt2-pipeline.yaml -cat gpt2-train.yaml >> gpt2-pipeline.yaml sky jobs launch -n gpt2 gpt2-pipeline.yaml --env BUCKET_NAME=your-bucket-name ``` +> Note: the pipeline yaml can be generated with the following command: ```bash +cat gpt2-data.yaml > gpt2-pipeline.yaml; echo "---" >> gpt2-pipeline.yaml; cat gpt2-train.yaml >> gpt2-pipeline.yaml ``` + SkyPilot will first download and process the dataset on a CPU VM and store the processed data in a GCS bucket. Then, it will launch a GPT-2 training job on a GPU VM. The training job will train GPT-2 (124M) on the processed data. diff --git a/llm/gpt-2/gpt2-data.yaml b/llm/gpt-2/gpt2-data.yaml index 8346e37ccb6..fc7bb02bf95 100644 --- a/llm/gpt-2/gpt2-data.yaml +++ b/llm/gpt-2/gpt2-data.yaml @@ -1,7 +1,8 @@ name: gpt2-data envs: - BUCKET_NAME: # Fill in your bucket name + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. resources: cpus: 8+ @@ -9,6 +10,7 @@ resources: file_mounts: /cache: name: $BUCKET_NAME + store: $BUCKET_STORE mode: MOUNT setup: | diff --git a/llm/gpt-2/gpt2-pipeline.yaml b/llm/gpt-2/gpt2-pipeline.yaml new file mode 100644 index 00000000000..e5ea05f7948 --- /dev/null +++ b/llm/gpt-2/gpt2-pipeline.yaml @@ -0,0 +1,129 @@ +name: gpt2-data + +envs: + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. 
+ +resources: + cpus: 8+ + +file_mounts: + /cache: + name: $BUCKET_NAME + store: $BUCKET_STORE + mode: MOUNT + +setup: | + pip install tqdm tiktoken requests datasets + git clone https://github.com/karpathy/llm.c.git@ed37d9261ba13ef212c01e2de8b309cbb46a2aa7 || true + + # Adding revision to fix the dataset version, as the latest fineweb + # dataset removed the samples, causing error: + # Please pass `features` or at least one example when writing data + sed -i 's/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train")/fw = load_dataset("HuggingFaceFW\/fineweb", name=remote_name, split="train", revision="9767af12bf8f0f7d3c91e0345b89bc6b9cbe1a94")/' dev/data/fineweb.py + + +run: | + cd llm.c + # tokenize the FineWeb dataset 10B tokens sample (takes ~1 hour, get lunch?) + # writes ~19GB of raw GPT-2 tokens to dev/data/fineweb10B + # and ~46GB in ~/.cache/huggingface/datasets/HuggingFaceFW___fineweb + python dev/data/fineweb.py --version 10B + + rsync -Pavz --exclude "datasets/downloads/" ~/.cache/huggingface /cache/ + rsync -Pavz dev/data/fineweb10B /cache/ +--- +name: gpt2-train + +envs: + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. + +resources: + accelerators: A100:8 + # Use docker image for latest version g++ to enable the compilation of llm.c. + image_id: docker:nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + any_of: + # Avoid using docker image for lambda due to the docker is not supported on + # Lambda yet, but the base image works. 
+ - cloud: lambda + image_id: null + - cloud: aws + - cloud: gcp + - cloud: azure + - cloud: fluidstack + - cloud: kubernetes + +file_mounts: + ~/.cache/huggingface: + name: $BUCKET_NAME + store: $BUCKET_STORE + mode: COPY + +setup: | + cd ~ + + # install cudnn so we can use FlashAttention and run fast (optional) + # https://developer.nvidia.com/cudnn-downloads + # for me, CUDA 12 (run `nvcc --version`) running on Linux x86_64 Ubuntu 22.04 + if [ -f ./CUDNN_INSTALLED ]; then + echo "cudnn already installed" + else + system=$(lsb_release -si | tr '[:upper:]' '[:lower:]') + # Get version and remove the dot + version=$(lsb_release -sr | tr -d .) + export system_version="${system}${version}" + wget https://developer.download.nvidia.com/compute/cudnn/9.1.1/local_installers/cudnn-local-repo-${system_version}-9.1.1_1.0-1_amd64.deb -O cudnn-installer.deb + sudo dpkg -i cudnn-installer.deb + sudo cp /var/cudnn-local-repo-${system_version}-9.1.1/cudnn-*-keyring.gpg /usr/share/keyrings/ + # Remove problematic kubernetes.list source + sudo apt-get update --allow-releaseinfo-change || true + + sudo apt-get -y install cudnn-cuda-12 + + touch ./CUDNN_INSTALLED + fi + + # "install" cudnn-frontend to ~/ + sudo apt -y install git + git clone https://github.com/NVIDIA/cudnn-frontend.git || true + + # install MPI (optional, if you intend to use multiple GPUs) + # SkyPilot do not install MPI as that requires NCCL which needs to be manually + # installed. 
+ sudo apt install -y openmpi-bin openmpi-doc libopenmpi-dev + # install nccl + pip install nvidia-nccl-cu12 + export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/nccl2/lib + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/nccl2/include + + git clone https://github.com/karpathy/llm.c.git || true + cd llm.c + ln -s ~/.cache/huggingface/fineweb10B dev/data/ + # compile llm.c (mixed precision, with cuDNN flash-attention) + # first compilation is ~1 minute, mostly due to cuDNN + make train_gpt2cu USE_CUDNN=1 + + +run: | + cd ~/llm.c + # train on multiple GPUs + mpirun -np $SKYPILOT_NUM_GPUS_PER_NODE --allow-run-as-root ./train_gpt2cu \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" \ + -o log124M \ + -e "d12" \ + -b 64 -t 1024 \ + -d 524288 \ + -r 1 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -v 250 -s 20000 \ + -h 1 + + # Upload the log and model to the bucket + rsync -Pavz log124M ~/.cache/huggingface diff --git a/llm/gpt-2/gpt2-train.yaml b/llm/gpt-2/gpt2-train.yaml index e907d28d781..3a4e8c28d14 100644 --- a/llm/gpt-2/gpt2-train.yaml +++ b/llm/gpt-2/gpt2-train.yaml @@ -1,7 +1,8 @@ -name: train +name: gpt2-train envs: - BUCKET_NAME: # Fill in your bucket name + BUCKET_NAME: # TODO: Fill in your bucket name + BUCKET_STORE: s3 # Can be s3, gcs, or r2. 
resources: accelerators: A100:8 @@ -21,11 +22,11 @@ resources: file_mounts: ~/.cache/huggingface: name: $BUCKET_NAME + store: $BUCKET_STORE mode: COPY setup: | cd ~ - pip install tqdm tiktoken requests datasets # install cudnn so we can use FlashAttention and run fast (optional) # https://developer.nvidia.com/cudnn-downloads diff --git a/sky/cli.py b/sky/cli.py index 23c5ba7a3cd..db5291d949c 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -772,6 +772,8 @@ def _make_task_or_dag_from_entrypoint_with_overrides( else: task = sky.Task(name='sky-cmd', run=entrypoint) task.set_resources({sky.Resources()}) + # env update has been done for DAG in load_chain_dag_from_yaml for YAML. + task.update_envs(env) # Override. if workdir is not None: @@ -787,7 +789,6 @@ def _make_task_or_dag_from_entrypoint_with_overrides( task.num_nodes = num_nodes if name is not None: task.name = name - task.update_envs(env) return task diff --git a/sky/utils/dag_utils.py b/sky/utils/dag_utils.py index ef80bfd2a17..7a4fe90e7fb 100644 --- a/sky/utils/dag_utils.py +++ b/sky/utils/dag_utils.py @@ -70,9 +70,9 @@ def load_chain_dag_from_yaml( Has special handling for an initial section in YAML that contains only the 'name' field, which is the DAG name. - 'env_overrides' is in effect only when there's exactly one task. It is a - list of (key, value) pairs that will be used to update the task's 'envs' - section. + 'env_overrides' is a list of (key, value) pairs that will be used to update + the task's 'envs' section. If it is a chain dag, the envs will be updated + for all tasks in the chain. Returns: A chain Dag with 1 or more tasks (an empty entrypoint would create a @@ -90,12 +90,6 @@ def load_chain_dag_from_yaml( # YAML has only `name: xxx`. Still instantiate a task. configs = [{'name': dag_name}] - if len(configs) > 1: - # TODO(zongheng): in a chain DAG of N tasks, cli.py currently makes the - # decision to not apply overrides. Here we maintain this behavior. 
We - # can listen to user feedback to change this. - env_overrides = None - current_task = None with dag_lib.Dag() as dag: for task_config in configs: