Release 0.1.0 (#14)
* prepare setup.py and requirements.txt
* synchronized initial weights
* setting up mnist unit test
* add a pytest.ini for automatic testing

Co-authored-by: Abhinav Bhatele <[email protected]>
siddharth9820 and bhatele committed Apr 23, 2022
1 parent 7397cfd commit ccd878b
Showing 15 changed files with 134 additions and 56 deletions.
.github/workflows/unit-tests.yaml
@@ -1,4 +1,4 @@
name: unit tests
name: formatting tests

on:
push:
@@ -7,35 +7,30 @@ on:
branches: [ develop ]

jobs:
build:

formatting:
runs-on: ${{ matrix.os }}

strategy:
matrix:
os: [ubuntu-latest, macos-latest]
python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
python-version: 3.9

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Update black
if: ${{ matrix.python-version == 3.9 }}
run: |
pip install --upgrade black
- name: Lint and Format Check with flake8 and black
if: ${{ matrix.python-version == 3.9 }}
run: |
black --diff --check .
flake8
34 changes: 34 additions & 0 deletions .github/workflows/nvidia-tests.yaml
@@ -0,0 +1,34 @@
name: nvidia-rtx-3090 tests

on:
push:
branches: [ develop ]
pull_request:
branches: [ develop ]

jobs:
mnist-trainer:
runs-on: [ nvidia ]

strategy:
matrix:
ginter: [ 1, 2 ]
memopt: [ '0', '1' ]
steps:
- uses: actions/checkout@v3
- name: Install AxoNN
run: |
pip install -r requirements.txt
- name: Download dataset
run: |
python -c "import torchvision; torchvision.datasets.MNIST(root=\"./axonn/tests\", download=True, train=True)"
- name: Train
run: |
export G_inter=${{ matrix.ginter }}
export G_data=$(( 2 / G_inter ))
export memopt=${{ matrix.memopt }}
echo "training with G_inter = ${G_inter}, G_data = ${G_data}, memopt = ${memopt}"
mpirun -n 2 pytest --with-mpi
- name: Uninstall AxoNN
run: |
pip uninstall --yes axonn
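
The `G_data=$(( 2 / G_inter ))` arithmetic ties the data-parallel degree to the inter-layer-parallel degree on the two-GPU runner, so the two always multiply to the GPU count. A minimal sketch of that invariant (the helper name is ours, not part of AxoNN):

```python
# Hypothetical helper mirroring the shell arithmetic in the workflow above:
# G_inter * G_data must equal the number of GPUs driving the run.
def derive_g_data(num_gpus: int, g_inter: int) -> int:
    assert num_gpus % g_inter == 0, "G_inter must divide the GPU count"
    return num_gpus // g_inter

assert derive_g_data(2, 1) == 2  # matrix entry ginter: 1
assert derive_g_data(2, 2) == 1  # matrix entry ginter: 2
```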
9 changes: 8 additions & 1 deletion README.md
@@ -3,7 +3,14 @@
[![Build Status](https://github.com/hpcgroup/axonn/actions/workflows/unit-tests.yaml/badge.svg)](https://github.com/hpcgroup/axonn/actions)
[![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

A parallel framework for training deep neural networks.
AxoNN is a parallel framework for training deep neural networks.

### Installation
[PyTorch](https://pytorch.org/get-started/locally/) must be installed before installing AxoNN.

```bash
pip install axonn
```

### Contributing

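For orientation, here is a minimal single-GPU training-loop sketch assembled from the user-facing calls this commit exercises in `axonn/tests/test_vit.py` (`ax.init`, `register_model_and_optimizer`, `register_loss_fn`, `create_dataloader`, `run_batch`); the tiny model and the `G_inter = G_data = 1` configuration are illustrative assumptions, not part of the README:

```python
import torch
import torchvision
from torchvision.transforms import ToTensor
from axonn import axonn as ax

# Single-GPU setup: both parallelism degrees are 1 (assumed for illustration).
ax.init(G_data=1, G_inter=1, mixed_precision=True, fp16_allreduce=True)

model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(784, 10)).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
ax.register_model_and_optimizer(model, optimizer)
ax.register_loss_fn(torch.nn.CrossEntropyLoss())

train_dataset = torchvision.datasets.MNIST(
    root="./data", train=True, download=True, transform=ToTensor()
)
# Arguments as used in the test: dataset, batch size, micro-batch size, workers.
train_loader = ax.create_dataloader(train_dataset, 64, 16, 0)

for x, y in train_loader:
    optimizer.zero_grad()
    x, y = x.cuda(), y.cuda()
    batch_loss = ax.run_batch(x, y, eval_mode=False)
    optimizer.step()
```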
4 changes: 4 additions & 0 deletions axonn/__init__.py
@@ -0,0 +1,4 @@
# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4 changes: 4 additions & 0 deletions axonn/axonn.py
@@ -366,6 +366,7 @@ def _initialize_mixed_precision_with_cpu_offload(
return model, optimizer


@torch.no_grad()
def register_model_and_optimizer(model_shard, optimizer):
"""AxoNN's user facing function to register a model shard and
the corresponding optimizer.
@@ -397,6 +398,9 @@ def register_model_and_optimizer(model_shard, optimizer):
model_params.div_(config.G_data), async_op=False
) # sync all parameters across data parallel ranks

if computation_dtype == torch.float16:
model_params_fp32.copy_(model_params_fp16)

fp32_optimizer = optimizer
fp32_optimizer.skip_next_step = False

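These two hunks implement the "synchronized initial weights" item from the commit message: registration now runs under `@torch.no_grad()`, and after the all-reduce the fp32 master copy is refreshed from the synchronized fp16 weights so the optimizer does not step from stale values. A standalone sketch of the pattern using plain `torch.distributed` (AxoNN's flattened parameter buffers and `config` object are elided):

```python
import torch
import torch.distributed as dist

@torch.no_grad()
def sync_initial_weights(params_fp16, params_fp32, g_data):
    # Divide by the data-parallel degree, then sum across ranks: every rank
    # ends up holding the same averaged initial weights.
    params_fp16.div_(g_data)
    dist.all_reduce(params_fp16, async_op=False)
    # Mirror the synchronized fp16 weights into the fp32 master copy that the
    # optimizer actually updates (the second hunk above).
    params_fp32.copy_(params_fp16)
```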
7 changes: 6 additions & 1 deletion axonn/optim.py
@@ -1,3 +1,8 @@
# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import torch
from torch.optim.optimizer import Optimizer
from . import axonn as ax
@@ -15,7 +20,7 @@ def __init__(
lr=1e-3,
betas=(0.9, 0.999),
eps=1e-8,
weight_decay=1e-2,
weight_decay=0,
bucket_size=16000000,
coalescing_factor=4,
):
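Note the behavioral change: `CPUAdam` no longer applies weight decay by default. A usage sketch based on the constructor signature shown above (assumes AxoNN has already been initialized with `cpu_offload=True`):

```python
import torch
from axonn import optim

model = torch.nn.Linear(128, 128)
# weight_decay now defaults to 0; pass 1e-2 explicitly to recover the old default.
optimizer = optim.CPUAdam(model.parameters(), lr=1e-3, weight_decay=1e-2)
```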
48 changes: 29 additions & 19 deletions examples/test_vit.py → axonn/tests/test_vit.py
@@ -4,27 +4,32 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception


from axonn import axonn as ax
from axonn import optim
import torchvision
from external.models.vit import DistributedViT
from torchvision.transforms import ToTensor
import torch
from tqdm import tqdm
import pytest
import os


@pytest.mark.mpi
def test_vit_mnist():
bs_per_gpu = 64
num_gpus = 6
bs = num_gpus * bs_per_gpu
mbs = bs_per_gpu
epochs = 10
cpu_offload = True
N, D, H = 12, 768, 12
from axonn import axonn as ax
from axonn import optim

G_inter = int(os.environ.get("G_inter"))
assert 6 % G_inter == 0
G_data = int(os.environ.get("G_data"))
bs = int(os.environ.get("batch_size", 64))
mbs = int(os.environ.get("micro_batch_size", 16))
epochs = int(os.environ.get("epochs", 10))
cpu_offload = bool(int(os.environ.get("memopt", "0")))
N, D, H = 6, 128, 8

ax.init(
G_data=2,
G_inter=3,
G_data=G_data,
G_inter=G_inter,
mixed_precision=True,
fp16_allreduce=True,
cpu_offload=cpu_offload,
@@ -52,22 +57,19 @@ def test_vit_mnist():
if cpu_offload:
optimizer = optim.CPUAdam(model.parameters(), lr=0.001)
else:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
ax.register_model_and_optimizer(model, optimizer)

ax.register_loss_fn(torch.nn.CrossEntropyLoss())

train_dataset = torchvision.datasets.MNIST(
root="./examples/dataset/", train=True, transform=ToTensor()
root="./axonn/tests", train=True, transform=ToTensor()
)
train_loader = ax.create_dataloader(train_dataset, bs, mbs, 0)

previous_model_state_memory = None
for epoch_number in range(epochs):
epoch_loss = 0
for x, y in tqdm(
train_loader,
disable=not (ilp_rank == 0 and ax.config.data_parallel_rank == 0),
):
for x, y in tqdm(train_loader, disable=True):
optimizer.zero_grad()
if ilp_rank == 0:
x, y = x.cuda(), y.cuda()
@@ -80,10 +82,18 @@
batch_loss = ax.run_batch(x, y, eval_mode=False)
optimizer.step()
epoch_loss += batch_loss
current_model_state_memory = torch.cuda.memory_allocated()
assert (not previous_model_state_memory) or (
current_model_state_memory == previous_model_state_memory
), "model state memory should stay the same throughout training"
if ilp_rank == G_inter - 1:
ax.print_status(
f"Epoch {epoch_number+1} : epoch loss {epoch_loss/len(train_loader)}"
f": model state memory = {torch.cuda.memory_allocated()/2**30} GB"
)

assert epoch_loss / len(train_loader) < 0.1, "model did not converge"


test_vit_mnist()
if __name__ == "__main__":
test_vit_mnist()
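
Because the test now reads `G_inter`, `G_data`, and `memopt` from the environment, the CI matrix above can sweep configurations without touching the code. A sketch of launching one matrix cell the same way locally (the two-rank count assumes a two-GPU machine):

```python
import os
import subprocess

# Mirror one cell of the nvidia-tests matrix: G_inter=2 on two GPUs,
# hence G_data = 2 / 2 = 1, with memory optimization off.
env = dict(os.environ, G_inter="2", G_data="1", memopt="0")
subprocess.run(["mpirun", "-n", "2", "pytest", "--with-mpi"], env=env, check=True)
```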
5 changes: 5 additions & 0 deletions examples/ptb_loader.py
@@ -1,3 +1,8 @@
# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from torch.utils.data import Dataset
import torch
import os
5 changes: 5 additions & 0 deletions examples/test_lm.py
@@ -1,3 +1,8 @@
# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from axonn import axonn as ax
from axonn import optim
from external.models.nvidia_transformer import DistributedGPT
5 changes: 5 additions & 0 deletions examples/wikitext_loader.py
@@ -1,3 +1,8 @@
# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import transformers
import os
from tqdm import tqdm
14 changes: 14 additions & 0 deletions external/models/nvidia_transformer.py
@@ -1,3 +1,17 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os

9 changes: 9 additions & 0 deletions pytest.ini
@@ -0,0 +1,9 @@
# Copyright 2022 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

[pytest]
addopts = --durations=20 -ra
testpaths = axonn/tests
python_files = *.py
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
torchvision
einops
tqdm

-e .
3 changes: 2 additions & 1 deletion setup.py
@@ -8,7 +8,7 @@

setup(
name="axonn",
version="0.0.1",
version="0.1.0",
description="A parallel library for extreme-scale deep learning",
long_description="""An asynchronous, message-driven parallel framework for
extreme-scale deep learning""",
@@ -18,4 +18,5 @@
classifiers=["Development Status :: 2 - Pre-Alpha"],
keywords="deep learning, distributed computing, parallel computing",
packages=find_packages(),
install_requires=["torch", "mpi4py"],
)
25 changes: 0 additions & 25 deletions train.sh

This file was deleted.
