From 995bfbedba0d70176699fc0686f59b7f7950665f Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld
Date: Wed, 5 Apr 2023 15:50:31 -0700
Subject: [PATCH 1/7] Adds a little script that can upload artifacts

---
 requirements.txt           |  3 ++-
 scripts/upload_artifact.py | 55 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 scripts/upload_artifact.py

diff --git a/requirements.txt b/requirements.txt
index 90816083c..41b8d0dae 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,5 @@ wandb
 # triton
 # flash-attn
 logzio-python-handler
-boto3
\ No newline at end of file
+boto3
+google-cloud-storage
\ No newline at end of file
diff --git a/scripts/upload_artifact.py b/scripts/upload_artifact.py
new file mode 100644
index 000000000..4bc685ca5
--- /dev/null
+++ b/scripts/upload_artifact.py
@@ -0,0 +1,55 @@
+import logging
+from pathlib import Path
+from typing import Tuple
+
+import rich.progress
+from google.cloud import storage
+
+import click
+
+from dolma.util import prepare_cli_environment
+
+
+log = logging.getLogger(__name__)
+
+
+@click.command()
+@click.argument(
+    "wandb_run_path",
+    type=str,
+)
+@click.argument(
+    "files_or_directories",
+    nargs=-1,
+    type=click.Path(exists=True, dir_okay=True, path_type=Path),
+)
+def main(
+    wandb_run_path: str,
+    files_or_directories: Tuple[Path],
+):
+    storage_client = storage.Client()
+    bucket = storage_client.bucket("allennlp-olmo", "ai2-allennlp")
+    prefix = wandb_run_path.strip("/")
+
+    files_or_directories = [
+        (file_or_directory, prefix + "/" + file_or_directory.name)
+        for file_or_directory in files_or_directories
+    ]
+    while len(files_or_directories) > 0:
+        file_or_directory, key = files_or_directories.pop()
+        if file_or_directory.is_file():
+            blob = bucket.blob(key)
+            with file_or_directory.open("rb") as f:
+                f = rich.progress.wrap_file(
+                    f,
+                    description=f"Uploading {file_or_directory} to gs://{bucket.name}/{key}",
+                    total=file_or_directory.stat().st_size)
+                blob.upload_from_file(f)
+        elif file_or_directory.is_dir():
+            for f in file_or_directory.iterdir():
+                files_or_directories.append((f, key + "/" + f.name))
+
+
+if __name__ == "__main__":
+    prepare_cli_environment()
+    main()

From 4a7a1063a310ae638817a94d84c67861ea70b2d6 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld
Date: Wed, 5 Apr 2023 16:05:43 -0700
Subject: [PATCH 2/7] Repair from my random experiment with progress bars

---
 scripts/upload_artifact.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/scripts/upload_artifact.py b/scripts/upload_artifact.py
index 4bc685ca5..8448405a8 100644
--- a/scripts/upload_artifact.py
+++ b/scripts/upload_artifact.py
@@ -2,7 +2,6 @@
 from pathlib import Path
 from typing import Tuple
 
-import rich.progress
 from google.cloud import storage
 
 import click
@@ -39,12 +38,8 @@ def main(
         file_or_directory, key = files_or_directories.pop()
         if file_or_directory.is_file():
             blob = bucket.blob(key)
-            with file_or_directory.open("rb") as f:
-                f = rich.progress.wrap_file(
-                    f,
-                    description=f"Uploading {file_or_directory} to gs://{bucket.name}/{key}",
-                    total=file_or_directory.stat().st_size)
-                blob.upload_from_file(f)
+            log.info(f"Uploading {file_or_directory} to gs://{bucket.name}/{key}")
+            blob.upload_from_filename(file_or_directory)
         elif file_or_directory.is_dir():
             for f in file_or_directory.iterdir():
                 files_or_directories.append((f, key + "/" + f.name))

From 514f0060687f048d42fdc06b12778a1c73189097 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld
Date: Wed, 5 Apr 2023 16:14:14 -0700
Subject: [PATCH 3/7] Why are progress bars so hard?

---
 scripts/upload_artifact.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/scripts/upload_artifact.py b/scripts/upload_artifact.py
index 8448405a8..94e31e0d9 100644
--- a/scripts/upload_artifact.py
+++ b/scripts/upload_artifact.py
@@ -5,6 +5,7 @@
 from google.cloud import storage
 
 import click
+from tqdm import tqdm
 
 from dolma.util import prepare_cli_environment
 
@@ -38,8 +39,15 @@ def main(
         file_or_directory, key = files_or_directories.pop()
         if file_or_directory.is_file():
            blob = bucket.blob(key)
-            log.info(f"Uploading {file_or_directory} to gs://{bucket.name}/{key}")
-            blob.upload_from_filename(file_or_directory)
+            with file_or_directory.open("rb") as f:
+                with tqdm.wrapattr(
+                    f,
+                    "read",
+                    total=file_or_directory.stat().st_size,
+                    miniters=1,
+                    desc=f"Uploading {file_or_directory} to gs://{bucket.name}/{key}"
+                ) as f:
+                    blob.upload_from_file(f, file_or_directory)
         elif file_or_directory.is_dir():
             for f in file_or_directory.iterdir():
                 files_or_directories.append((f, key + "/" + f.name))

From e942e814a41feb7bb0e014bc4ceec69b6d6248fc Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld
Date: Wed, 5 Apr 2023 16:30:40 -0700
Subject: [PATCH 4/7] Productivity through formatting

---
 scripts/upload_artifact.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/scripts/upload_artifact.py b/scripts/upload_artifact.py
index 94e31e0d9..5c65dd999 100644
--- a/scripts/upload_artifact.py
+++ b/scripts/upload_artifact.py
@@ -2,14 +2,12 @@
 from pathlib import Path
 from typing import Tuple
 
-from google.cloud import storage
-
 import click
+from google.cloud import storage
 from tqdm import tqdm
 
 from dolma.util import prepare_cli_environment
 
-
 log = logging.getLogger(__name__)
 
 
@@ -31,12 +29,11 @@ def main(
     bucket = storage_client.bucket("allennlp-olmo", "ai2-allennlp")
     prefix = wandb_run_path.strip("/")
 
-    files_or_directories = [
-        (file_or_directory, prefix + "/" + file_or_directory.name)
-        for file_or_directory in files_or_directories
+    files_or_directories_in_a_special_variable_because_mypy_is_lame = [
+        (file_or_directory, prefix + "/" + file_or_directory.name) for file_or_directory in files_or_directories
     ]
-    while len(files_or_directories) > 0:
-        file_or_directory, key = files_or_directories.pop()
+    while len(files_or_directories_in_a_special_variable_because_mypy_is_lame) > 0:
+        file_or_directory, key = files_or_directories_in_a_special_variable_because_mypy_is_lame.pop()
         if file_or_directory.is_file():
             blob = bucket.blob(key)
             with file_or_directory.open("rb") as f:
@@ -45,12 +42,14 @@ def main(
                     "read",
                     total=file_or_directory.stat().st_size,
                     miniters=1,
-                    desc=f"Uploading {file_or_directory} to gs://{bucket.name}/{key}"
+                    desc=f"Uploading {file_or_directory} to gs://{bucket.name}/{key}",
                 ) as f:
                     blob.upload_from_file(f, file_or_directory)
         elif file_or_directory.is_dir():
-            for f in file_or_directory.iterdir():
-                files_or_directories.append((f, key + "/" + f.name))
+            for directory_entry in file_or_directory.iterdir():
+                files_or_directories_in_a_special_variable_because_mypy_is_lame.append(
+                    (directory_entry, key + "/" + directory_entry.name)
+                )
 
 
 if __name__ == "__main__":

From 597d906156f9129140c641adffde817390b3fd08 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld
Date: Wed, 5 Apr 2023 16:58:18 -0700
Subject: [PATCH 5/7] Adds a note to the README about how to find models

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index 33e44b371..9b0a3afa8 100644
--- a/README.md
+++ b/README.md
@@ -59,3 +59,12 @@ gantry run \
 This may require a reservation on the Infiniband cluster.
 See the [Beaker documentation](https://beaker-docs.apps.allenai.org/distributed-training.html)
 for more information on distributed training.
+
+## Finding official runs
+
+We keep all of our runs in WandB under [the "ai2-llm" entity](https://wandb.ai/ai2-llm).
+We don't store model checkpoints in WandB. Those are in GCS under `gs://allennlp-olmo/`.
+
+### Highlighted models
+
+ * 300M parameters, ~70B tokens, a starter model that's not completely random: https://wandb.ai/ai2-llm/LLM-scripts/runs/ed5krfk9
\ No newline at end of file

From a3b969da7bd2003b259505cdb9ed31909f50a935 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld
Date: Wed, 5 Apr 2023 17:03:30 -0700
Subject: [PATCH 6/7] Adds an entry to the training log

---
 LOG.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/LOG.md b/LOG.md
index 84ab52b81..9080b2f07 100644
--- a/LOG.md
+++ b/LOG.md
@@ -9,6 +9,21 @@ That is, within each transformer block we compute `MLP(LN(x)) + Attention(LN(x))
 This allows to increase throughput because we can fuse the separate feed-forward and attention input projections into a single linear layer.
 We also experimented with [fusing the output projections](https://github.com/allenai/LLM/pull/79) into a single linear layer but that didn't help,
 possibly due to the overhead of concatenating the feed-forward and attention activations together.
+
+2023-04-02
+----------
+
+First training run! We trained a 300M model on about 70B tokens from C4.
+The purpose of this model is to give the other LLM teams something in our format that's not completely random,
+so they can test their evaluation and inference code.
+
+This ran on a single node only on AMD's cluster.
+On AMD hardware we're still missing Flash Attention, and we could not get `torch.compile()` to work in time for the run.
+Both are expected to provide significant speedups.
+This training run used model settings that are optimal for compiled models, despite not being able to compile,
+because we want it to be a representative model for the downstream evaluations.
+
+
 2023-03-28
 ----------

From 8cc0bb224f559746196358476cc3402e8a332e96 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld
Date: Wed, 5 Apr 2023 17:12:18 -0700
Subject: [PATCH 7/7] Some more noodling around to make click happy

---
 scripts/upload_artifact.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/upload_artifact.py b/scripts/upload_artifact.py
index 5c65dd999..ee41f5a6b 100644
--- a/scripts/upload_artifact.py
+++ b/scripts/upload_artifact.py
@@ -25,6 +25,11 @@ def main(
     wandb_run_path: str,
     files_or_directories: Tuple[Path],
 ):
+    """
+    Uploads artifacts to GCS. This uploads to a hardcoded bucket in GCS, because that's where we expect to keep all the artifacts for OLMo.
+
+    WANDB_RUN_PATH: The "Weights and Biases" run path. You get this by going to the run in wandb and clicking on the "copy run path" button. We will use this as the prefix for the paths in the GCS bucket.
+    """
     storage_client = storage.Client()
     bucket = storage_client.bucket("allennlp-olmo", "ai2-allennlp")
     prefix = wandb_run_path.strip("/")
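
---
Usage sketch for the script as it stands after patch 7/7. The run path below is the
one highlighted in the README; the checkpoint directory and config file names are
hypothetical, chosen only to illustrate the arguments.

    # Upload a checkpoint directory and a config file. The wandb run path becomes
    # the key prefix inside the hardcoded gs://allennlp-olmo bucket, so this needs
    # Google Cloud credentials with write access to that bucket.
    # "checkpoints/" and "config.yaml" are made-up example arguments.
    python scripts/upload_artifact.py ai2-llm/LLM-scripts/ed5krfk9 checkpoints/ config.yaml

Directories are walked recursively, so everything under checkpoints/ would land at
gs://allennlp-olmo/ai2-llm/LLM-scripts/ed5krfk9/checkpoints/... in this sketch.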