Commit

Include snippets model-parallel

Jethro Gaglione committed Feb 15, 2024
1 parent 7cfdc3d commit 885870a

Showing 5 changed files with 15 additions and 96 deletions.
28 changes: 4 additions & 24 deletions docs/modelScripts/train_pytorch_modelParallel.py
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

#This example trains a sequential neural network and logs
#our model and some parameters/metrics of interest with MLflow

@@ -9,10 +11,6 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os
import sys

@@ -39,10 +37,8 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
running_loss = 0.0
model.train()


for i, (images, labels) in enumerate(train_loader):
images = torch.div(images, 255.)
# images, labels = images.to(device), labels.to(device)

optimizer.zero_grad()
outputs = model(images)
@@ -54,9 +50,9 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
average_loss = running_loss / len(train_loader)


print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')
print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}")

print('Training finished.')
print("Training finished.")


input_size = 784
@@ -70,10 +66,7 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
if torch.cuda.device_count() < 2:
sys.exit("A minimum of 2 GPUs must be available to train this model.")

#print("Training on device: ", device)
my_net = SeqNet(input_size, hidden_size1, output_size)
#my_net = my_net.to(device)


optimizer = torch.optim.Adam( my_net.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()
@@ -86,16 +79,3 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):

train(my_net, fmnist_train_loader, loss_function, optimizer, num_epochs)

"""
correct = 0
total = 0
for images,labels in fmnist_test_loader:
images = torch.div(images, 255.)
images = images.to(device)
labels = labels.to(device)
output = my_net(images)
_, predicted = torch.max(output,1)
correct += (predicted == labels).sum()
total += labels.size(0)
print('Accuracy of the model: %.3f %%' %((100*correct)/(total+1)))
"""
2 changes: 2 additions & 0 deletions docs/modelScripts/train_pytorch_multigpu.py
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

#This example trains a Pytorch model using DDP, which parallelizes data
#across multiple GPUs
#note: MLflow autolog is not functional on the latest version of Pytorch
Binary file modified docs/tutorials/.single.md.swp
Binary file not shown.
3 changes: 2 additions & 1 deletion docs/tutorials/ddp.md
@@ -10,7 +10,7 @@ The purpose of this tutorial is to demonstrate the structure of Pytorch code mea

First we import the necessary libraries:
```python
{% include _includes/includesnippet filename='modelScripts/train_pytorch_multigpu.py' starttext='import torch' endtext='import os' %}
{% include _includes/includesnippet filename='modelScripts/train_pytorch_multigpu.py' starttext='import torch ' endtext='import os' %}
```

Then we run the necessary DDP configuration:
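For orientation, a typical DDP setup looks roughly like the sketch below. This is an assumed single-node, one-process-per-GPU pattern, not necessarily the exact configuration in `train_pytorch_multigpu.py`; the `ddp_setup`/`ddp_cleanup` names and the `MASTER_ADDR`/`MASTER_PORT` values are illustrative.
```python
import os
import torch
from torch.distributed import init_process_group, destroy_process_group

def ddp_setup(rank: int, world_size: int):
    # Illustrative rendezvous settings for a single-node run.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    # One process per GPU: 'rank' doubles as the local GPU index here.
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def ddp_cleanup():
    # Tear down the process group once training is finished.
    destroy_process_group()
```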
@@ -47,3 +47,4 @@ We can now write the part of our code that will check for the number of availabl
```
{: .note }
Download the full script used in this example [here](https://github.com/accre/mltf/blob/main/docs/modelScripts/train_pytorch_multigpu.py)

78 changes: 7 additions & 71 deletions docs/tutorials/model-parallel.md
@@ -9,88 +9,24 @@ The purpose of this tutorial is to describe a method to parallelize training in

First, we import the necessary packages. Nothing additional to the packages used in single-GPU training is necessary for this model-parallel approach.
```python
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import sys
{% include _includes/includesnippet filename='modelScripts/train_pytorch_modelParallel.py' starttext='import torch ' endtext='import sys' %}
```
Now we build our model and define our forward pass:
```python
class SeqNet(nn.Module):
def __init__(self, input_size, hidden_size1, output_size):
super(SeqNet, self).__init__()

self.lin1 = nn.Linear(input_size, hidden_size1).to('cuda:0')
self.lin2 = nn.Linear(hidden_size1, output_size).to('cuda:1')


def forward(self, x):
x = torch.flatten(x,1)
x = self.lin1(x.to('cuda:0'))
x = F.log_softmax(x, dim=1)
out = self.lin2(x.to('cuda:1'))
return out
{% include _includes/includesnippet filename='modelScripts/train_pytorch_modelParallel.py' starttext='class SeqNet(nn.Module):' endtext='return out' %}
```
This is where most of the work to parallelize our model is accomplished. We send each layer of our model to a different GPU with `.to('cuda:0')` and `.to('cuda:1')` calls as we define our model layers. It is also important that each step of the forward pass happens on the appropriate GPU, which we ensure by sending the `x` tensor to the correct place with `x.to('cuda:0')` and `x.to('cuda:1')` calls.
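For reference, a minimal sketch of this pattern might look like the following (the class name, activation, and layer sizes are hypothetical, and at least two visible GPUs are assumed; the tutorial's actual model is the `SeqNet` class included above):
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TwoDeviceNet(nn.Module):
    # Hypothetical two-layer net split across two GPUs; a sketch of the
    # model-parallel pattern, not the tutorial's SeqNet.
    def __init__(self, in_features=784, hidden=200, out_features=10):
        super().__init__()
        self.lin1 = nn.Linear(in_features, hidden).to('cuda:0')   # layer 1 lives on GPU 0
        self.lin2 = nn.Linear(hidden, out_features).to('cuda:1')  # layer 2 lives on GPU 1

    def forward(self, x):
        x = torch.flatten(x, 1)                 # flatten images to vectors
        x = F.relu(self.lin1(x.to('cuda:0')))   # inputs must be on GPU 0 for layer 1
        return self.lin2(x.to('cuda:1'))        # activations move to GPU 1 for layer 2
```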

We can now define our training function:
```python
def train(model, train_loader, loss_function, optimizer, num_epochs):

for epoch in range(num_epochs):

running_loss = 0.0
model.train()


for i, (images, labels) in enumerate(train_loader):
images = torch.div(images, 255.)

optimizer.zero_grad()
outputs = model(images)
loss = loss_function(outputs, labels.to('cuda:1'))
loss.backward()
optimizer.step()
running_loss += loss.item()

average_loss = running_loss / len(train_loader)


print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')

print('Training finished.')
{% include _includes/includesnippet filename='modelScripts/train_pytorch_modelParallel.py' starttext='def train(model,' endtext='print("Training finished.")' %}
```
where the labels used in the loss calculation must be sent to the device that holds the model's output, in this case `cuda:1`.

We can now continue our training as usual:
```python
input_size = 784
hidden_size1 = 200
hidden_size2 = 200
output_size = 10
num_epochs = 10
batch_size = 100
lr = 0.01

if torch.cuda.device_count() < 2:
sys.exit("A minimum of 2 GPUs must be available to train this model.")

my_net = SeqNet(input_size, hidden_size1, output_size)

optimizer = torch.optim.Adam( my_net.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

fmnist_train = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())
fmnist_test = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())

fmnist_train_loader = DataLoader(fmnist_train, batch_size=batch_size, shuffle=True)
fmnist_test_loader = DataLoader(fmnist_test, batch_size=batch_size, shuffle=True)

train(my_net, fmnist_train_loader, loss_function, optimizer, num_epochs)
{% include _includes/includesnippet filename='modelScripts/train_pytorch_modelParallel.py' starttext='input_size = 784' endtext=', num_epochs)' %}
```
{: .note }
Download the full script used in this example [here](https://github.com/accre/mltf/blob/main/docs/modelScripts/train_pytorch_modelParallel.py)
