From 885870aab56f2e4ce24850e89fd6882c8075c0dd Mon Sep 17 00:00:00 2001
From: Jethro Gaglione
Date: Thu, 15 Feb 2024 16:38:51 -0600
Subject: [PATCH] Include snippets model-parallel

---
 .../train_pytorch_modelParallel.py          | 28 +------
 docs/modelScripts/train_pytorch_multigpu.py |  2 +
 docs/tutorials/.single.md.swp               | Bin 12288 -> 12288 bytes
 docs/tutorials/ddp.md                       |  3 +-
 docs/tutorials/model-parallel.md            | 78 ++----------------
 5 files changed, 15 insertions(+), 96 deletions(-)

diff --git a/docs/modelScripts/train_pytorch_modelParallel.py b/docs/modelScripts/train_pytorch_modelParallel.py
index df0cd61..48f6e2f 100644
--- a/docs/modelScripts/train_pytorch_modelParallel.py
+++ b/docs/modelScripts/train_pytorch_modelParallel.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 #This example trains a sequential nueral network and logs
 #our model and some paramterts/metric of interest with MLflow
 
@@ -9,10 +11,6 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 
-import torch.multiprocessing as mp
-from torch.utils.data.distributed import DistributedSampler
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.distributed import init_process_group, destroy_process_group
 import os
 import sys
 
@@ -39,10 +37,8 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
         running_loss = 0.0
         model.train()
 
-
         for i ,(images,labels) in enumerate(train_loader):
             images = torch.div(images, 255.)
-#            images, labels = images.to(device), labels.to(device)
 
             optimizer.zero_grad()
             outputs = model(images)
@@ -54,9 +50,9 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
 
         average_loss = running_loss / len(train_loader)
 
-        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')
+        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}")
 
-    print('Training finished.')
+    print("Training finished.")
 
 
 input_size = 784
@@ -70,10 +66,7 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
 
 if not torch.cuda.is_available():
     sys.exit("A minimum of 2 GPUs must be available to train this model.")
 
-#print("Training on device: ", device)
 my_net = SeqNet(input_size, hidden_size1, output_size)
-#my_net = my_net.to(device)
-
 optimizer = torch.optim.Adam( my_net.parameters(), lr=lr)
 loss_function = nn.CrossEntropyLoss()
 
@@ -86,16 +79,3 @@ def train(model, train_loader, loss_function, optimizer, num_epochs):
 
 train(my_net, fmnist_train_loader, loss_function, optimizer, num_epochs)
 
-"""
-correct = 0
-total = 0
-for images,labels in fmnist_test_loader:
-    images = torch.div(images, 255.)
-    images = images.to(device)
-    labels = labels.to(device)
-    output = my_net(images)
-    _, predicted = torch.max(output,1)
-    correct += (predicted == labels).sum()
-    total += labels.size(0)
-print('Accuracy of the model: %.3f %%' %((100*correct)/(total+1)))
-"""
diff --git a/docs/modelScripts/train_pytorch_multigpu.py b/docs/modelScripts/train_pytorch_multigpu.py
index 154329a..5e046e7 100644
--- a/docs/modelScripts/train_pytorch_multigpu.py
+++ b/docs/modelScripts/train_pytorch_multigpu.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 #This example trais a Pytorch model using DDP, which parallelized data
 #across miltiple GPUs
 #note: MLflow autolog is not functional on the latest version of Pytorch
diff --git a/docs/tutorials/.single.md.swp b/docs/tutorials/.single.md.swp
index 821634f600b821028a79dd64024d598624218ca5..e771269fc5b806683156ce5f6cf3ffe0e9a29e83 100644
GIT binary patch
delta 13
UcmZojXh@hK!L)4FMyY4|045#
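
For reference, a minimal sketch of the kind of two-GPU split ("model parallelism") that the snippets in this patch refer to. The class name, layer attributes, hidden/output sizes, and device strings below are illustrative assumptions, not taken from the patched script:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class ModelParallelSeqNet(nn.Module):
        """Two-layer network with one layer placed on each GPU (hypothetical sizes)."""
        def __init__(self, input_size=784, hidden_size=128, output_size=10):
            super().__init__()
            self.fc1 = nn.Linear(input_size, hidden_size).to("cuda:0")   # first layer on GPU 0
            self.fc2 = nn.Linear(hidden_size, output_size).to("cuda:1")  # second layer on GPU 1

        def forward(self, x):
            x = x.view(x.size(0), -1).to("cuda:0")   # flatten the images and move the batch to GPU 0
            x = F.relu(self.fc1(x))
            x = x.to("cuda:1")                       # hand the activation across to GPU 1
            return self.fc2(x)

    # In the training loop the labels must live on the same device as the final output:
    #   outputs = model(images)
    #   loss = loss_function(outputs, labels.to("cuda:1"))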