Features to match OpenLM #302

Merged 43 commits on Oct 10, 2023.

Commits (43):
c6b5ee9 - cast logits to fp32 before output (epwalsh, Sep 28, 2023)
27e7c84 - revert logit manual cast (epwalsh, Sep 28, 2023)
1850ebb - add option for no weight tying (epwalsh, Sep 28, 2023)
b852ec4 - init ff_out weights (epwalsh, Sep 28, 2023)
a2f1517 - fix init device for ff_out (epwalsh, Sep 28, 2023)
387f659 - refactor how we cache buffers (epwalsh, Sep 29, 2023)
160d143 - cache rope sin and cos (epwalsh, Sep 29, 2023)
7fc33c5 - Refactor how RoPE is applied (epwalsh, Sep 29, 2023)
ee95fd3 - remove unused import (epwalsh, Sep 29, 2023)
95c806c - Add back Olmo.device property (epwalsh, Sep 29, 2023)
299b5cc - Merge branch 'main' into petew/tweaks (epwalsh, Sep 29, 2023)
5a628a3 - give cache a type, make it required in constructors (epwalsh, Oct 2, 2023)
ba80eba - Merge branch 'main' into petew/tweaks (epwalsh, Oct 3, 2023)
0e6dfcd - Merge branch 'main' into petew/tweaks (epwalsh, Oct 6, 2023)
b331f8b - add mitch config (epwalsh, Oct 6, 2023)
75c5813 - add option to override hidden size (epwalsh, Oct 6, 2023)
b9805ff - MCLI configs (epwalsh, Oct 6, 2023)
36370d0 - rename config option to `mlp_hidden_size` (epwalsh, Oct 6, 2023)
7e8b88f - don't use adaptive clipping (epwalsh, Oct 6, 2023)
a4577b6 - clean up mcli config (epwalsh, Oct 6, 2023)
6b68368 - Add option to skip pre-train ckpt (for debuggin) (epwalsh, Oct 6, 2023)
20c16da - No QK norm, no affines (epwalsh, Oct 6, 2023)
f51b04e - enable flash (epwalsh, Oct 6, 2023)
de4ba36 - update configs (epwalsh, Oct 6, 2023)
20aca2a - apply rotary in FP32 (epwalsh, Oct 6, 2023)
c47ab78 - clean up (epwalsh, Oct 6, 2023)
e8be916 - Add v1.5 mix mitch-ish (epwalsh, Oct 6, 2023)
d99da62 - Add option to disable SSL with requests to S3 (epwalsh, Oct 6, 2023)
119d4ec - schedule (epwalsh, Oct 7, 2023)
99e3729 - Add LUMI config for mitch (epwalsh, Oct 7, 2023)
4829516 - fix (epwalsh, Oct 7, 2023)
f2813ae - no save overwrite (epwalsh, Oct 7, 2023)
933c6ad - fix affine qnorm config (epwalsh, Oct 8, 2023)
d22ef89 - Merge branch 'main' into petew/tweaks (epwalsh, Oct 8, 2023)
c110059 - Don't download if we don't have to (epwalsh, Oct 8, 2023)
7906bd2 - update configs (epwalsh, Oct 8, 2023)
40cddcd - remove duplicate field (epwalsh, Oct 8, 2023)
d97c172 - clean up (epwalsh, Oct 9, 2023)
f42d0ac - remove urllib3 pin (epwalsh, Oct 9, 2023)
62fcb47 - Ensure RoPE applied in full precision (epwalsh, Oct 9, 2023)
d9fb29c - make rope helpers instance methods (epwalsh, Oct 9, 2023)
580f2e9 - Add note about attention mask (epwalsh, Oct 10, 2023)
657021c - clean up (epwalsh, Oct 10, 2023)
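
Several of the commits above (refactor how we cache buffers, cache rope sin and cos, apply rotary in FP32, Ensure RoPE applied in full precision) concern how rotary position embeddings are computed and cached. The following is a rough sketch of the idea rather than the PR's actual code; all class and method names here are assumptions for illustration:

from typing import Optional

import torch


class RotaryEmbedding(torch.nn.Module):
    # Hypothetical module, not the PR's implementation: caches the RoPE sin/cos
    # tables and applies the rotation in float32 before casting back.

    def __init__(self, dim: int, base: int = 10000):
        super().__init__()
        self.dim = dim
        self.base = base
        self._cos_cached: Optional[torch.Tensor] = None
        self._sin_cached: Optional[torch.Tensor] = None

    def _get_cos_sin(self, seq_len: int, device: torch.device):
        # Rebuild the cache only when it is missing, too short, or on the wrong device.
        if (
            self._cos_cached is None
            or self._cos_cached.shape[-2] < seq_len
            or self._cos_cached.device != device
        ):
            inv_freq = 1.0 / (
                self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
            )
            t = torch.arange(seq_len, device=device, dtype=torch.float32)
            freqs = torch.einsum("i,j->ij", t, inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1)
            self._cos_cached = emb.cos()[None, None, :, :]
            self._sin_cached = emb.sin()[None, None, :, :]
        return self._cos_cached[..., :seq_len, :], self._sin_cached[..., :seq_len, :]

    @staticmethod
    def _rotate_half(x: torch.Tensor) -> torch.Tensor:
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    def forward(self, q: torch.Tensor, k: torch.Tensor):
        # q, k: (batch, n_heads, seq_len, head_dim). Rotate in full precision,
        # then cast back to the inputs' original dtype (e.g. bf16).
        q_, k_ = q.float(), k.float()
        cos, sin = self._get_cos_sin(q_.shape[-2], q_.device)
        q_ = q_ * cos + self._rotate_half(q_) * sin
        k_ = k_ * cos + self._rotate_half(k_) * sin
        return q_.type_as(q), k_.type_as(k)

Caching avoids rebuilding the sin/cos tables on every forward pass, and doing the rotation in float32 reflects the precision concern behind the "apply rotary in FP32" and "Ensure RoPE applied in full precision" commits.
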
31 changes: 31 additions & 0 deletions configs/mcli/v1-mix-medium-mitch-ish.yaml
@@ -0,0 +1,31 @@
run_name: v1-mix-medium-mitch-ish
image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
gpu_num: 216
cluster: r12z3
gpu_type: a100_40gb
integrations:
  - integration_type: git_repo
    git_repo: allenai/LLM
    git_branch: main # make sure to update this!
    pip_install: -e .
    ssh_clone: true
command: |-
  pip freeze
  mkdir -p /root/.cache/torch/

  export OMP_NUM_THREADS=8
  export LOG_FILTER_TYPE=local_rank0_only
  export OLMO_NO_SSL=1 # we get SSLErrors all the time on this cluster
  #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

  cd LLM

  torchrun \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    --nnodes $NUM_NODES \
    --node_rank $NODE_RANK \
    --nproc_per_node 8 \
    scripts/train.py configs/v1-mix-medium-mitch-ish-mcli.yaml \
      --run_name=v1-mix-mitch-ish \
      --global_train_batch_size=2160
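
The OLMO_NO_SSL=1 export in this config corresponds to the "Add option to disable SSL with requests to S3" commit. As a hedged sketch only (the helper name and wiring are assumptions, not the repo's code), the flag might be consumed roughly like this when building an S3 client:

import os

import boto3


def make_s3_client():
    # Hypothetical helper: when OLMO_NO_SSL is set (as in the MCLI configs here),
    # skip SSL certificate verification to work around the cluster's recurring SSLErrors.
    disable_ssl = os.environ.get("OLMO_NO_SSL", "0") != "0"
    return boto3.client("s3", verify=not disable_ssl)

For scale, the launch above uses 216 A100s (27 nodes at 8 GPUs each, per --nproc_per_node 8) with --global_train_batch_size=2160, which works out to 10 sequences per device, assuming no gradient accumulation.
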
9 changes: 8 additions & 1 deletion configs/mcli/v1-mix-medium.yaml
@@ -7,12 +7,19 @@ integrations:
   - integration_type: git_repo
     git_repo: allenai/LLM
     git_branch: main # make sure to update this!
-    pip_install: -e .[all]
+    pip_install: -e .
     ssh_clone: true
 command: |-
   pip freeze
   mkdir -p /root/.cache/torch/

+  export OMP_NUM_THREADS=8
+  export LOG_FILTER_TYPE=local_rank0_only
+  export OLMO_NO_SSL=1 # we get SSLErrors all the time on this cluster
+  #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
+
+  cd LLM
+
   torchrun \
     --master_addr $MASTER_ADDR \
     --master_port $MASTER_PORT \
31 changes: 31 additions & 0 deletions configs/mcli/v1_5-mix-medium-mitch-ish.yaml
@@ -0,0 +1,31 @@
run_name: v1-5-mix-medium-mitch-ish # can't have "_" or "." here
image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
gpu_num: 216
cluster: r12z3
gpu_type: a100_40gb
integrations:
  - integration_type: git_repo
    git_repo: allenai/LLM
    git_branch: main # make sure to update this!
    pip_install: -e .
    ssh_clone: true
command: |-
  pip freeze
  mkdir -p /root/.cache/torch/

  export OMP_NUM_THREADS=8
  export LOG_FILTER_TYPE=local_rank0_only
  export OLMO_NO_SSL=1
  #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

  cd LLM

  torchrun \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    --nnodes $NUM_NODES \
    --node_rank $NODE_RANK \
    --nproc_per_node 8 \
    scripts/train.py configs/v1_5-mix-medium-mitch-ish-mcli.yaml \
      --run_name=v1_5-mix-mitch-ish \
      --global_train_batch_size=2160
9 changes: 8 additions & 1 deletion configs/mcli/v1_5-mix-medium.yaml
@@ -7,12 +7,19 @@ integrations:
   - integration_type: git_repo
     git_repo: allenai/LLM
     git_branch: main # make sure to update this!
-    pip_install: -e .[all]
+    pip_install: -e .
     ssh_clone: true
 command: |-
   pip freeze
   mkdir -p /root/.cache/torch/

+  export OMP_NUM_THREADS=8
+  export LOG_FILTER_TYPE=local_rank0_only
+  export OLMO_NO_SSL=1 # we get SSLErrors all the time on this cluster
+  #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
+
+  cd LLM
+
   torchrun \
     --master_addr $MASTER_ADDR \
     --master_port $MASTER_PORT \
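
Other commits in this PR (add option for no weight tying, init ff_out weights, add option to override hidden size, rename config option to `mlp_hidden_size`) touch the model code rather than these launch configs. Below is a minimal sketch of the untied output projection idea, with all names hypothetical rather than taken from the repo:

import torch
import torch.nn as nn


class ToyLMOutput(nn.Module):
    # Hypothetical illustration of optional weight tying between the token
    # embedding and the final vocabulary projection ("ff_out" in the commit messages).

    def __init__(self, d_model: int, vocab_size: int, weight_tying: bool = True):
        super().__init__()
        self.wte = nn.Embedding(vocab_size, d_model)
        # With weight tying, the embedding matrix doubles as the output projection;
        # without it, a separate ff_out layer is created and initialized on its own.
        self.ff_out = None if weight_tying else nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        if self.ff_out is None:
            return nn.functional.linear(hidden, self.wte.weight)
        return self.ff_out(hidden)

Along the same lines, an `mlp_hidden_size` option would presumably let a config set the feed-forward width directly instead of deriving it from `d_model` and a ratio.
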