Commit

Include snippets model-parallel

Jethro Gaglione committed Feb 15, 2024
1 parent 7cfdc3d commit 885870a

Showing 5 changed files with 15 additions and 96 deletions.
28 changes: 4 additions & 24 deletions docs/modelScripts/train_pytorch_modelParallel.py
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

#This example trains a sequential neural network and logs
#our model and some parameters/metrics of interest with MLflow

@@ -9,10 +11,6 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os
import sys

@@ -39,10 +37,8 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
running_loss = 0.0
model.train()


for i, (images, labels) in enumerate(train_loader):
images = torch.div(images, 255.)
# images, labels = images.to(device), labels.to(device)

optimizer.zero_grad()
outputs = model(images)
@@ -54,9 +50,9 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
average_loss = running_loss / len(train_loader)


print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')
print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}")

print('Training finished.')
print("Training finished.")


input_size = 784
@@ -70,10 +66,7 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
if torch.cuda.device_count() < 2:
sys.exit("A minimum of 2 GPUs must be available to train this model.")

#print("Training on device: ", device)
my_net = SeqNet(input_size, hidden_size1, output_size)
#my_net = my_net.to(device)


optimizer = torch.optim.Adam( my_net.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()
@@ -86,16 +79,3 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):

train(my_net, fmnist_train_loader, loss_function, optimizer, num_epochs)

"""
correct = 0
total = 0
for images,labels in fmnist_test_loader:
images = torch.div(images, 255.)
images = images.to(device)
labels = labels.to(device)
output = my_net(images)
_, predicted = torch.max(output,1)
correct += (predicted == labels).sum()
total += labels.size(0)
print('Accuracy of the model: %.3f %%' %((100*correct)/(total+1)))
"""
2 changes: 2 additions & 0 deletions docs/modelScripts/train_pytorch_multigpu.py
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

#This example trains a Pytorch model using DDP, which parallelizes data
#across multiple GPUs
#note: MLflow autolog is not functional on the latest version of Pytorch
Binary file modified docs/tutorials/.single.md.swp
Binary file not shown.
3 changes: 2 additions & 1 deletion docs/tutorials/ddp.md
@@ -10,7 +10,7 @@ The purpose of this tutorial is to demonstrate the structure of Pytorch code mea

First we import the necessary libraries:
```python
{% include _includes/includesnippet filename='modelScripts/train_pytorch_multigpu.py' starttext='import torch' endtext='import os' %}
{% include _includes/includesnippet filename='modelScripts/train_pytorch_multigpu.py' starttext='import torch ' endtext='import os' %}
```

Then we run the necessary DDP configuration:
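For orientation, a typical DDP setup looks roughly like the sketch below. This is an assumed single-node, one-process-per-GPU pattern, not necessarily the exact configuration in `train_pytorch_multigpu.py`; the `ddp_setup`/`ddp_cleanup` names and the `MASTER_ADDR`/`MASTER_PORT` values are illustrative.
```python
import os
import torch
from torch.distributed import init_process_group, destroy_process_group

def ddp_setup(rank: int, world_size: int):
    # Illustrative rendezvous settings for a single-node run.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    # One process per GPU: 'rank' doubles as the local GPU index here.
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def ddp_cleanup():
    # Tear down the process group once training is finished.
    destroy_process_group()
```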
@@ -47,3 +47,4 @@ We can now write the part of our code that will check for the number of availabl
```
{: .note }
Download the full script used in this example [here](https://github.com/accre/mltf/blob/main/docs/modelScripts/train_pytorch_multigpu.py)

78 changes: 7 additions & 71 deletions docs/tutorials/model-parallel.md
@@ -9,88 +9,24 @@ The purpose of this tutorial is to describe a method to parallelize training in

First, we import the necessary packages. Nothing additional to the packages used in single-GPU training is necessary for this model-parallel approach.
```python
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import sys
{% include _includes/includesnippet filename='modelScripts/train_pytorch_modelParallel.py' starttext='import torch ' endtext='import sys' %}
```
Now we build our model and define our forward pass:
```python
class SeqNet(nn.Module):
def __init__(self, input_size, hidden_size1, output_size):
super(SeqNet, self).__init__()

self.lin1 = nn.Linear(input_size, hidden_size1).to('cuda:0')
self.lin2 = nn.Linear(hidden_size1, output_size).to('cuda:1')


def forward(self, x):
x = torch.flatten(x,1)
x = self.lin1(x.to('cuda:0'))
x = F.log_softmax(x, dim=1)
out = self.lin2(x.to('cuda:1'))
return out
{% include _includes/includesnippet filename='modelScripts/train_pytorch_modelParallel.py' starttext='class SeqNet(nn.Module):' endtext='return out' %}
```
This is where most of the work to parallelize our model is accomplished. We send each layer of our model to a different GPU with `.to('cuda:0')` and `.to('cuda:1')` calls as we define our model layers. It is also important that each step of the forward pass happens on the appropriate GPU, which we ensure by sending the `x` tensor to the correct place with `x.to('cuda:0')` and `x.to('cuda:1')` calls.
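For reference, a minimal sketch of this pattern might look like the following (the class name, activation, and layer sizes are hypothetical, and at least two visible GPUs are assumed; the tutorial's actual model is the `SeqNet` class included above):
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TwoDeviceNet(nn.Module):
    # Hypothetical two-layer net split across two GPUs; a sketch of the
    # model-parallel pattern, not the tutorial's SeqNet.
    def __init__(self, in_features=784, hidden=200, out_features=10):
        super().__init__()
        self.lin1 = nn.Linear(in_features, hidden).to('cuda:0')   # layer 1 lives on GPU 0
        self.lin2 = nn.Linear(hidden, out_features).to('cuda:1')  # layer 2 lives on GPU 1

    def forward(self, x):
        x = torch.flatten(x, 1)                 # flatten images to vectors
        x = F.relu(self.lin1(x.to('cuda:0')))   # inputs must be on GPU 0 for layer 1
        return self.lin2(x.to('cuda:1'))        # activations move to GPU 1 for layer 2
```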

We can now define our training function:
```python
def train(model, train_loader, loss_function, optimizer, num_epochs):

for epoch in range(num_epochs):

running_loss = 0.0
model.train()


for i, (images, labels) in enumerate(train_loader):
images = torch.div(images, 255.)

optimizer.zero_grad()
outputs = model(images)
loss = loss_function(outputs, labels.to('cuda:1'))
loss.backward()
optimizer.step()
running_loss += loss.item()

average_loss = running_loss / len(train_loader)


print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')

print('Training finished.')
{% include _includes/includesnippet filename='modelScripts/train_pytorch_modelParallel.py' starttext='def train(model,' endtext='print("Training finished.")' %}
```
where the labels used in the loss calculation must be sent to the device that holds the model's output, in this case `cuda:1`.

We can now continue our training as usual:
```python
input_size = 784
hidden_size1 = 200
hidden_size2 = 200
output_size = 10
num_epochs = 10
batch_size = 100
lr = 0.01

if torch.cuda.device_count() < 2:
sys.exit("A minimum of 2 GPUs must be available to train this model.")

my_net = SeqNet(input_size, hidden_size1, output_size)

optimizer = torch.optim.Adam( my_net.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

fmnist_train = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())
fmnist_test = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())

fmnist_train_loader = DataLoader(fmnist_train, batch_size=batch_size, shuffle=True)
fmnist_test_loader = DataLoader(fmnist_test, batch_size=batch_size, shuffle=True)

train(my_net, fmnist_train_loader, loss_function, optimizer, num_epochs)
{% include _includes/includesnippet filename='modelScripts/train_pytorch_modelParallel.py' starttext='input_size = 784' endtext=', num_epochs)' %}
```
{: .note }
Download the full script used in this example [here](https://github.com/accre/mltf/blob/main/docs/modelScripts/train_pytorch_modelParallel.py)
