add Docker images and run GPU tests on Beaker (allenai#14)
* add image for gantry

* add make target for gantry testing

* GPU tests

* fix

* setup beaker

* fixes

* fix

* try fix

* try fix again

* try with direct clone

* try https

* try with custom GH token

* try again

* try again

* try again with default gh token

* fix

* fix gpu tests

* fix

* fix

* clean up
epwalsh authored Mar 2, 2023
1 parent ba20a85 commit 7b3a1a7
Showing 12 changed files with 250 additions and 8 deletions.
7 changes: 7 additions & 0 deletions .dockerignore
@@ -0,0 +1,7 @@
.git
.github
.mypy_cache
.pytest_cache
.venv
__pycache__
*.egg-info
48 changes: 48 additions & 0 deletions .github/workflows/main.yml
@@ -101,6 +101,54 @@ jobs:
          . .venv/bin/activate
          pip uninstall -y dolma
  gpu_tests:
    name: GPU Tests
    runs-on: ubuntu-latest
    env:
      BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
      BEAKER_IMAGE: dolma-test
      BEAKER_WORKSPACE: ai2/llm-testing
    steps:
      - name: Determine current commit SHA (pull request)
        if: github.event_name == 'pull_request'
        run: |
          echo "COMMIT_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
      - name: Determine current commit SHA (push)
        if: github.event_name != 'pull_request'
        run: |
          echo "COMMIT_SHA=$GITHUB_SHA" >> $GITHUB_ENV
      - name: GPU Tests
        uses: allenai/[email protected]
        if: env.BEAKER_TOKEN != ''
        with:
          spec: |
            version: v2
            description: GPU Tests
            tasks:
              - name: tests
                image:
                  beaker: ${{ env.BEAKER_IMAGE }}
                context:
                  priority: preemptible
                resources:
                  gpuCount: 1
                envVars:
                  - name: COMMIT_SHA
                    value: ${{ env.COMMIT_SHA }}
                  - name: GITHUB_TOKEN
                    value: ${{ secrets.GITHUB_TOKEN }}
                  - name: CUDA_LAUNCH_BLOCKING
                    value: "1"
                  - name: TOKENIZERS_PARALLELISM
                    value: "false"
                command: ["/entrypoint.sh", "pytest", "-v", "-m", "gpu", "tests/"]
                result:
                  path: /unused
          token: ${{ env.BEAKER_TOKEN }}
          workspace: ${{ env.BEAKER_WORKSPACE }}

  release:
    name: Release
    runs-on: ubuntu-latest
14 changes: 14 additions & 0 deletions Dockerfile.gantry
@@ -0,0 +1,14 @@
# Defines a CUDA-enabled Docker image suitable for running this project's experiments
# via beaker-gantry.
#
# To build and push the image to Beaker, run 'make gantry-image'.
# To test the image after pushing to Beaker, run 'make gantry-test'.

FROM ghcr.io/allenai/pytorch:1.13.1-cuda11.7-python3.10

WORKDIR /stage

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

WORKDIR /app/dolma
14 changes: 14 additions & 0 deletions Dockerfile.test
@@ -0,0 +1,14 @@
# Defines a CUDA-enabled Docker image suitable for running GPU tests on Beaker
# via the GitHub Action 'beaker-run-action'.
# The image needs to exist on Beaker for the tests to work.
#
# To build and push the image to Beaker, run 'make test-image'.

FROM ghcr.io/allenai/pytorch:1.13.1-cuda11.7-python3.10

COPY scripts/test_entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

WORKDIR /testing

ENTRYPOINT ["/entrypoint.sh"]
67 changes: 67 additions & 0 deletions Makefile
@@ -1,7 +1,74 @@
# If you update this, also update BEAKER_IMAGE in .github/workflows/main.yml
IMAGE_NAME_BASE = dolma
# If you update this, also update BEAKER_WORKSPACE in .github/workflows/main.yml
BEAKER_WORKSPACE = "ai2/llm-testing"

BEAKER_USER = $(shell beaker account whoami --format=json | jq -r '.[0].name')
GANTRY_IMAGE = $(shell beaker workspace images $(BEAKER_WORKSPACE) --format=json | jq -r -c '.[] | select( .name == "$(IMAGE_NAME_BASE)-gantry" ) | .fullName')
TEST_IMAGE = $(shell beaker workspace images $(BEAKER_WORKSPACE) --format=json | jq -r -c '.[] | select( .name == "$(IMAGE_NAME_BASE)-test" ) | .fullName')

.PHONY : run-checks
run-checks :
	isort --check .
	black --check .
	flake8 .
	mypy .
	CUDA_VISIBLE_DEVICES='' pytest -v --color=yes tests/

.PHONY : beaker-info
beaker-info :
	@echo "Beaker user: $(BEAKER_USER)"
	@echo "Gantry image: $(GANTRY_IMAGE)"
	@echo "Testing image: $(TEST_IMAGE)"

.PHONY : gantry-image
gantry-image :
	docker build -f Dockerfile.gantry -t $(IMAGE_NAME_BASE)-gantry .
	beaker image create $(IMAGE_NAME_BASE)-gantry --name $(IMAGE_NAME_BASE)-gantry-tmp --workspace $(BEAKER_WORKSPACE)
	beaker image delete $(GANTRY_IMAGE) || true
	beaker image rename $(BEAKER_USER)/$(IMAGE_NAME_BASE)-gantry-tmp $(IMAGE_NAME_BASE)-gantry

.PHONY : test-image
test-image :
	docker build -f Dockerfile.test -t $(IMAGE_NAME_BASE)-test .
	beaker image create $(IMAGE_NAME_BASE)-test --name $(IMAGE_NAME_BASE)-test-tmp --workspace $(BEAKER_WORKSPACE)
	beaker image delete $(TEST_IMAGE) || true
	beaker image rename $(BEAKER_USER)/$(IMAGE_NAME_BASE)-test-tmp $(IMAGE_NAME_BASE)-test

.PHONY : show-test-image
show-test-image :
	@echo $(TEST_IMAGE)

.PHONY : show-beaker-workspace
show-beaker-workspace :
	@echo $(BEAKER_WORKSPACE)

.PHONY : gantry-test
gantry-test :
	gantry run \
		--workspace "$(BEAKER_WORKSPACE)" \
		--priority "preemptible" \
		--beaker-image "$(GANTRY_IMAGE)" \
		--gpus 1 \
		--description "Test run" \
		--cluster ai2/allennlp-cirrascale \
		--cluster ai2/aristo-cirrascale \
		--cluster ai2/mosaic-cirrascale \
		--cluster ai2/mosaic-cirrascale-a100 \
		--cluster ai2/prior-cirrascale \
		--cluster ai2/s2-cirrascale \
		--cluster ai2/general-cirrascale \
		--cluster ai2/general-cirrascale-a100-80g-ib \
		--allow-dirty \
		--venv base \
		--timeout -1 \
		--yes \
		-- make check-cuda-install

.PHONY : check-cpu-install
check-cpu-install :
	@python -c 'from dolma import check_install; check_install(cuda=False)'

.PHONY : check-cuda-install
check-cuda-install :
	@python -c 'from dolma import check_install; check_install(cuda=True)'
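
Taken together, the new targets give a simple image-management workflow. A sketch of typical usage (assuming the `docker`, `beaker`, `gantry`, and `jq` CLIs are installed and authenticated against the `ai2/llm-testing` workspace):

```
# Show the Beaker user and the image names the Makefile resolves to.
make beaker-info

# Rebuild and push both Beaker images, e.g. after changing requirements.txt
# or one of the Dockerfiles.
make gantry-image
make test-image

# Submit a throwaway gantry job that runs `make check-cuda-install` on a GPU node.
make gantry-test

# Local CPU-only sanity check of the install.
make check-cpu-install
```

The `*-image` targets push under a temporary name, delete the old image, and then rename, so the stable names `dolma-gantry` and `dolma-test` always point at the latest build.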
8 changes: 8 additions & 0 deletions README.md
@@ -1 +1,9 @@
# DOLMA: Delightful Open Language Model from AI2

## Setup

After cloning this repository, first install the latest [PyTorch](https://pytorch.org) according to the official instructions for your environment. Then install this package and its remaining dependencies by running:

```
pip install -e .[dev] --config-settings editable_mode=compat
```
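
To confirm the install worked, the `check_install` helper added in this commit can be run directly; it is the same check the new `make check-cpu-install` target wraps:

```
python -c 'from dolma import check_install; check_install(cuda=False)'
```

On a machine with a GPU, passing `cuda=True` additionally asserts that CUDA is visible to PyTorch.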
14 changes: 13 additions & 1 deletion dolma/__init__.py
@@ -2,4 +2,16 @@
from .model import DolmaGPT, DolmaGPTOutput
from .tokenizer import Tokenizer, TruncationDirection

__all__ = ["Config", "Tokenizer", "TruncationDirection", "DolmaGPT", "DolmaGPTOutput"]
__all__ = ["Config", "Tokenizer", "TruncationDirection", "DolmaGPT", "DolmaGPTOutput", "check_install"]


def check_install(cuda: bool = False):
    import torch

    from .version import VERSION

    if cuda:
        assert torch.cuda.is_available(), "CUDA is not available!"
        print("CUDA available")

    print(f"DOLMA v{VERSION} installed")
2 changes: 1 addition & 1 deletion dolma/model.py
@@ -146,7 +146,7 @@ def __init__(self, config: Config):
        self.transformer.update(
            {"wpe": nn.Embedding(config.max_sequence_length, config.d_model, device=config.init_device)}
        )
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False, device=config.init_device)
        if self.config.init_device != "meta":
            self.apply(self.param_init_fn)

1 change: 1 addition & 0 deletions pytest.ini
@@ -4,4 +4,5 @@ python_classes = Test* *Test
log_format = %(asctime)s - %(levelname)s - %(name)s - %(message)s
log_level = DEBUG
markers =
    gpu: marks tests that need GPUs
filterwarnings =
3 changes: 3 additions & 0 deletions requirements.txt
@@ -1,3 +1,5 @@
# NOTE: when upgrading requirements here you may have to rebuild and push some
# Docker images. See each Dockerfile for details on how to do that.
numpy
torch
mosaicml
@@ -6,3 +8,4 @@ tokenizers
click
rich
cached-path
beaker-gantry
38 changes: 38 additions & 0 deletions scripts/test_entrypoint.sh
@@ -0,0 +1,38 @@
#!/bin/bash

# Exit script if any commands fail.
set -e
set -o pipefail

# Check that the environment variables have been set correctly
for env_var in GITHUB_TOKEN COMMIT_SHA; do
    if [[ -z "${!env_var}" ]]; then
        echo >&2 "error: required environment variable $env_var is empty"
        exit 1
    fi
done

# Initialize conda for bash.
# See https://stackoverflow.com/a/58081608/4151392
eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"

# Install GitHub CLI.
conda install gh --channel conda-forge

# Configure git to use GitHub CLI as a credential helper so that we can clone private repos.
gh auth setup-git

# Clone the repository and check out the commit under test.
mkdir LLM && cd LLM
gh repo clone allenai/LLM .
git checkout --quiet "$COMMIT_SHA"

# Install dependencies.
pip install --upgrade pip
pip install --no-cache-dir '.[dev]'

# Create directory for results.
mkdir -p /results

# Execute the arguments to this script as commands themselves, piping output into a log file.
exec "$@" 2>&1 | tee /results/out.log
42 changes: 36 additions & 6 deletions tests/model_test.py
@@ -6,11 +6,38 @@
from dolma.data import DataCollator, PaddingDirection


@pytest.mark.parametrize("alibi", [pytest.param(True, id="alibi-emb"), pytest.param(False, id="posit-emb")])
def test_forward(config: Config, tokenizer: Tokenizer, alibi: bool):
@pytest.mark.parametrize(
    "alibi, cuda",
    [
        pytest.param(True, False, id="alibi-emb-cpu"),
        pytest.param(False, False, id="posit-emb-cpu"),
        pytest.param(
            True,
            True,
            id="alibi-emb-cuda",
            marks=(
                pytest.mark.gpu,
                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA devices"),
            ),
        ),
        pytest.param(
            False,
            True,
            id="posit-emb-cuda",
            marks=(
                pytest.mark.gpu,
                pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Requires CUDA devices"),
            ),
        ),
    ],
)
def test_forward(config: Config, tokenizer: Tokenizer, alibi: bool, cuda: bool):
    torch.manual_seed(0)

    config.alibi = alibi
    if cuda:
        config.init_device = "cuda"

    model = DolmaGPT(config).eval()

    input1 = tokenizer.encode("My name is DOLMA!")
@@ -21,11 +48,14 @@ def test_forward(config: Config, tokenizer: Tokenizer, alibi: bool):
            {"input_ids": input2, "attention_mask": [1.0] * len(input2)},
        ]
    )
    batch_inputs = {  # type: ignore
        k: v.to(device=config.device) if isinstance(v, torch.Tensor) else v for k, v in batch_inputs.items()
    }

    # Check that logits from individual inputs are equal to logits from batch.
    with torch.inference_mode():
        output1 = model(torch.tensor(input1).unsqueeze(0))
        output2 = model(torch.tensor(input2).unsqueeze(0))
        output1 = model(torch.tensor(input1, device=config.device).unsqueeze(0))
        output2 = model(torch.tensor(input2, device=config.device).unsqueeze(0))
        batch_output = model(**batch_inputs)

    torch.testing.assert_close(output1.logits[0][: len(input1)], batch_output.logits[0][: len(input1)])
@@ -40,7 +70,7 @@ def test_backward(config: Config, tokenizer: Tokenizer, alibi: bool):
    model = DolmaGPT(config).train()

    # Forward pass to get logits.
    input_ids = torch.tensor(tokenizer.encode("My name is DOLMA!")).unsqueeze(0)
    input_ids = torch.tensor(tokenizer.encode("My name is DOLMA!"), device=config.device).unsqueeze(0)
    logits = model(input_ids).logits

    # Compute loss.
@@ -55,7 +85,7 @@
    for name, parameter in model.named_parameters():
        if parameter.requires_grad:
            assert parameter.grad is not None
            zeros = torch.zeros(parameter.size())
            zeros = torch.zeros(parameter.size(), device=config.device)
            if (parameter.grad == zeros).all():
                raise RuntimeError(f"{name} has a zero gradient!")
            else:
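
With the `gpu` marker registered in `pytest.ini`, the new CUDA parametrizations can also be exercised or excluded locally without going through Beaker. A sketch, assuming a checkout with the `dev` extras installed:

```
# Run only the GPU-marked tests -- the same selection the Beaker task uses.
# They skip themselves automatically when no CUDA device is visible.
pytest -v -m gpu tests/

# Run everything except the GPU tests, e.g. on a laptop.
pytest -v -m "not gpu" tests/
```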
