
Commit

Merge pull request allenai#85 from allenai/UploadArtifact
Upload artifact
dirkgr authored Apr 6, 2023
2 parents ef3e157 + 8cc0bb2 commit 9911b78
Showing 4 changed files with 88 additions and 1 deletion.
15 changes: 15 additions & 0 deletions LOG.md
@@ -9,6 +9,21 @@ That is, within each transformer block we compute `MLP(LN(x)) + Attention(LN(x))`
This allows us to increase throughput because we can fuse the separate feed-forward and attention input projections into a single linear layer.
We also experimented with [fusing the output projections](https://github.com/allenai/LLM/pull/79) into a single linear layer, but that didn't help, possibly due to the overhead of concatenating the feed-forward and attention activations together.
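
As an illustration of this layout, here is a minimal PyTorch sketch of a parallel block with a single fused input projection. The class, dimension names, and the use of `scaled_dot_product_attention` (PyTorch 2.0) are assumptions for this example, not the repository's actual implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ParallelBlock(nn.Module):
    """Parallel transformer block: x + MLP(LN(x)) + Attention(LN(x)), with the
    attention QKV projection and the MLP up-projection fused into one linear layer."""

    def __init__(self, d_model: int, n_heads: int, mlp_ratio: int = 4):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_mlp = mlp_ratio * d_model
        self.ln = nn.LayerNorm(d_model)
        # A single matmul produces Q, K, V and the MLP hidden activations.
        self.fused_in = nn.Linear(d_model, 3 * d_model + self.d_mlp)
        # Output projections stay separate (fusing them did not help, per the note above).
        self.attn_out = nn.Linear(d_model, d_model)
        self.mlp_out = nn.Linear(self.d_mlp, d_model)
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, _ = x.shape
        h = self.ln(x)  # one shared LayerNorm feeds both branches
        q, k, v, mlp_h = self.fused_in(h).split(
            [self.d_model, self.d_model, self.d_model, self.d_mlp], dim=-1
        )
        # Reshape to (B, n_heads, T, head_dim) for attention.
        q, k, v = (t.view(B, T, self.n_heads, -1).transpose(1, 2) for t in (q, k, v))
        attn = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        attn = attn.transpose(1, 2).reshape(B, T, self.d_model)
        return x + self.attn_out(attn) + self.mlp_out(self.act(mlp_h))
```

For instance, `ParallelBlock(d_model=512, n_heads=8)(torch.randn(2, 16, 512))` returns a tensor of shape `(2, 16, 512)`.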


2023-04-02
----------

First training run! We trained a 300M model on about 70B tokens from C4.
The purpose of this model is to give the other LLM teams something in our format that's not completely random,
so they can test their evaluation and inference code.

This ran on only a single node of AMD's cluster.
On AMD hardware we're still missing Flash Attention, and we could not get `torch.compile()` to work in time for the run.
Both are expected to provide significant speedups.
This training run used model settings that are optimal for compiled models, even though we could not compile,
because we want it to be a representative model for the downstream evaluations.


2023-03-28
----------

9 changes: 9 additions & 0 deletions README.md
@@ -59,3 +59,12 @@ gantry run \
This may require a reservation on the InfiniBand cluster.

See the [Beaker documentation](https://beaker-docs.apps.allenai.org/distributed-training.html) for more information on distributed training.

## Finding official runs

We keep all of our runs in WandB under [the "ai2-llm" entity](https://wandb.ai/ai2-llm).
We don't store model checkpoints in WandB. Those are in GCS under `gs://allennlp-olmo/<wandb_run_path>`.

### Highlighted models

* 300M parameters, ~70B tokens, a starter model that's not completely random: https://wandb.ai/ai2-llm/LLM-scripts/runs/ed5krfk9
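
As a rough sketch of how a run's checkpoint files could be fetched from that GCS location with the `google-cloud-storage` client; the helper name, run path, and local directory below are illustrative, not part of the repository:

```python
from pathlib import Path

from google.cloud import storage


def download_run_checkpoints(wandb_run_path: str, local_dir: Path) -> None:
    """Download everything stored under gs://allennlp-olmo/<wandb_run_path>/."""
    client = storage.Client()
    prefix = wandb_run_path.strip("/")
    for blob in client.list_blobs("allennlp-olmo", prefix=prefix):
        # Mirror the bucket layout under local_dir.
        target = local_dir / Path(blob.name).relative_to(prefix)
        target.parent.mkdir(parents=True, exist_ok=True)
        blob.download_to_filename(str(target))


# e.g. download_run_checkpoints("ai2-llm/LLM-scripts/ed5krfk9", Path("checkpoints"))
```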
3 changes: 2 additions & 1 deletion requirements.txt
@@ -16,4 +16,5 @@ wandb
# triton
# flash-attn
logzio-python-handler
boto3
boto3
google-cloud-storage
62 changes: 62 additions & 0 deletions scripts/upload_artifact.py
@@ -0,0 +1,62 @@
import logging
from pathlib import Path
from typing import Tuple

import click
from google.cloud import storage
from tqdm import tqdm

from dolma.util import prepare_cli_environment

log = logging.getLogger(__name__)


@click.command()
@click.argument(
    "wandb_run_path",
    type=str,
)
@click.argument(
    "files_or_directories",
    nargs=-1,
    type=click.Path(exists=True, dir_okay=True, path_type=Path),
)
def main(
    wandb_run_path: str,
    files_or_directories: Tuple[Path, ...],
):
    """
    Uploads artifacts to GCS. This uploads to a hardcoded bucket in GCS, because that's where we expect to
    keep all the artifacts for OLMo.

    WANDB_RUN_PATH: The "Weights and Biases" run path. You get this by going to the run in wandb and clicking
    on the "copy run path" button. We will use this as the prefix for the paths in the GCS bucket.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket("allennlp-olmo", "ai2-allennlp")
    prefix = wandb_run_path.strip("/")

    # Work queue of (local path, GCS key) pairs; directories are expanded into it as we go.
    files_or_directories_in_a_special_variable_because_mypy_is_lame = [
        (file_or_directory, prefix + "/" + file_or_directory.name) for file_or_directory in files_or_directories
    ]
    while len(files_or_directories_in_a_special_variable_because_mypy_is_lame) > 0:
        file_or_directory, key = files_or_directories_in_a_special_variable_because_mypy_is_lame.pop()
        if file_or_directory.is_file():
            blob = bucket.blob(key)
            with file_or_directory.open("rb") as f:
                # Wrap the file's read() in a tqdm progress bar so the upload shows progress.
                with tqdm.wrapattr(
                    f,
                    "read",
                    total=file_or_directory.stat().st_size,
                    miniters=1,
                    desc=f"Uploading {file_or_directory} to gs://{bucket.name}/{key}",
                ) as f:
                    blob.upload_from_file(f)
        elif file_or_directory.is_dir():
            # Queue the directory's children under the corresponding key prefix.
            for directory_entry in file_or_directory.iterdir():
                files_or_directories_in_a_special_variable_because_mypy_is_lame.append(
                    (directory_entry, key + "/" + directory_entry.name)
                )


if __name__ == "__main__":
    prepare_cli_environment()
    main()
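
For reference, the script takes a WandB run path followed by one or more files or directories, e.g. `python scripts/upload_artifact.py ai2-llm/LLM-scripts/ed5krfk9 checkpoints/` (the run path and checkpoint directory here are only illustrative). Directories are walked recursively, and each file ends up under `gs://allennlp-olmo/<wandb_run_path>/`.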
