Commit 22874f7 (parent: d918c03), committed by Jethro Gaglione on Mar 10, 2024.
Showing 2 changed files with 298 additions and 0 deletions.
```python
# This example trains a sequential neural network and logs
# our model and some parameters/metrics of interest with MLflow,
# using Optuna to optimize hyperparameters.

import torch
import mlflow
import optuna
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Training on device: ", device)


class SeqNet(nn.Module):
    """A small fully connected network for 28x28 grayscale images."""

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(SeqNet, self).__init__()

        self.lin1 = nn.Linear(input_size, hidden_size1)
        self.lin2 = nn.Linear(hidden_size1, hidden_size2)
        self.lin3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = self.lin1(x)
        x = F.sigmoid(x)
        x = self.lin2(x)
        x = F.log_softmax(x, dim=1)
        out = self.lin3(x)
        return out


def train(model, train_loader, loss_function, optimizer, num_epochs):
    for epoch in range(num_epochs):
        running_loss = 0.0
        model.train()

        for i, (images, labels) in enumerate(train_loader):
            images = torch.div(images, 255.)
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        average_loss = running_loss / len(train_loader)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}')

    print('Training finished.')


def objective(trial):
    input_size = 784
    # let Optuna suggest the hidden-layer sizes for this trial
    hidden_size1 = trial.suggest_int('hidden_size1', 100, 300)
    hidden_size2 = trial.suggest_int('hidden_size2', 100, 300)
    output_size = 10
    num_epochs = 10
    batch_size = 100
    lr = 0.01

    my_net = SeqNet(input_size, hidden_size1, hidden_size2, output_size)
    my_net = my_net.to(device)

    optimizer = torch.optim.Adam(my_net.parameters(), lr=lr)
    loss_function = nn.CrossEntropyLoss()

    fmnist_train = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())
    fmnist_test = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())

    fmnist_train_loader = DataLoader(fmnist_train, batch_size=batch_size, shuffle=True)
    fmnist_test_loader = DataLoader(fmnist_test, batch_size=batch_size, shuffle=True)

    train(my_net, fmnist_train_loader, loss_function, optimizer, num_epochs)

    # evaluate accuracy on the test set; this is the value Optuna maximizes
    correct = 0
    total = 0
    for images, labels in fmnist_test_loader:
        images = torch.div(images, 255.)
        images = images.to(device)
        labels = labels.to(device)
        output = my_net(images)
        _, predicted = torch.max(output, 1)
        correct += (predicted == labels).sum()
        total += labels.size(0)

    acc = (100 * correct) / total
    return acc


with mlflow.start_run():
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=5)

    # log the best parameters and metric found by the study
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_acc", study.best_value)

    print("Best trial:")
    trial = study.best_trial

    print(" Value: ", trial.value)

    print(" Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
```
---
layout: default
parent: Tutorials
nav_order: 7
---

Hyperparameter Optimization with Optuna
======================
In place of grid or random search approaches to HPO, we recommend the Optuna framework for Bayesian hyperparameter sampling and for trial pruning (in models where intermediate results are available). Optuna also integrates with MLflow for convenient logging of optimal parameters.

In this tutorial, we take the model and training approach detailed in the [Single-GPU Training (Custom Mlflow)]({% link pytorch_singlGPU_customMLflow.md %}) tutorial as the basis for our HPO.

First, we install the Optuna package:
```bash
pip install optuna
```

We will now make adjustments to our training script to test a series of hyperparameters. This entails three main parts:
1. Wrap the whole of our model definition, training, and testing logic in an `objective` function that returns our chosen evaluation metric.
2. Suggest hyperparameters to test using Optuna's `trial.suggest_<type>()` methods.
3. Initiate a `study` with the number of trials we would like to run.

To use Optuna in our training scripts, we first import the Optuna package (in addition to those required by the model):
```python
import optuna
```
For the model detailed in [Single-GPU Training (Custom Mlflow)]({% link pytorch_singlGPU_customMLflow.md %}), ignoring MLflow-related code, our `objective` function looks like this:

```python
def objective(trial):

    input_size = 784
    # hidden_size1 = 200
    hidden_size1 = trial.suggest_int('hidden_size1', 100, 300)
    # hidden_size2 = 200
    hidden_size2 = trial.suggest_int('hidden_size2', 100, 300)
    output_size = 10
    num_epochs = 4
    batch_size = 100
    lr = 0.01

    my_net = SeqNet(input_size, hidden_size1, hidden_size2, output_size)
    my_net = my_net.to(device)

    optimizer = torch.optim.Adam(my_net.parameters(), lr=lr)
    loss_function = nn.CrossEntropyLoss()

    fmnist_train = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())
    fmnist_test = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())

    fmnist_train_loader = DataLoader(fmnist_train, batch_size=batch_size, shuffle=True)
    fmnist_test_loader = DataLoader(fmnist_test, batch_size=batch_size, shuffle=True)

    train(my_net, fmnist_train_loader, loss_function, optimizer, num_epochs)

    # evaluate accuracy on the test set; this is the value Optuna maximizes
    correct = 0
    total = 0
    for images, labels in fmnist_test_loader:
        images = torch.div(images, 255.)
        images = images.to(device)
        labels = labels.to(device)
        output = my_net(images)
        _, predicted = torch.max(output, 1)
        correct += (predicted == labels).sum()
        total += labels.size(0)

    acc = (100 * correct) / total
    return acc
```
where, instead of explicitly setting the sizes of our hidden layers, we let Optuna suggest values by calling `trial.suggest_int()` with the variable name and the lower/upper limits of the range we'd like to test.
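
Optuna provides analogous suggestion methods for other parameter types. As an illustrative sketch (these lines are not part of the script above), one could also search over the learning rate and batch size inside `objective`:
```python
# log-uniform float search over the learning rate (illustrative, not in the script above)
lr = trial.suggest_float('lr', 1e-4, 1e-1, log=True)
# categorical choice among a fixed set of batch sizes (illustrative)
batch_size = trial.suggest_categorical('batch_size', [50, 100, 200])
```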

It is important that this function returns the desired evaluation metric. In this case, we use accuracy `acc`.

In our main code, we can now instantiate a `study` and begin optimizing:
```python
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
```

One can access the best trial and print its value and parameters as follows:
```python
print("Best trial:")
trial = study.best_trial

print(" Value: ", trial.value)

print(" Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
```
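
To inspect every trial rather than just the best one, Optuna can also export the study history as a pandas DataFrame. A minimal sketch (the `params_`-prefixed column names follow Optuna's naming convention for suggested parameters):
```python
# view all trials as a pandas DataFrame (requires pandas)
df = study.trials_dataframe()
print(df[['number', 'value', 'params_hidden_size1', 'params_hidden_size2']])
```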

Another useful feature of Optuna is trial pruning. To implement this, we must report our evaluation score at each step using `trial.report(intermediate_value, step)` and select a pruning method when creating our study:
```python
study = optuna.create_study(pruner=optuna.pruners.SuccessiveHalvingPruner(), direction="maximize")
```
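Reporting happens inside the `objective` function. A minimal sketch, assuming the training and evaluation logic is restructured into per-epoch helpers (`train_one_epoch` and `evaluate` are hypothetical, not defined in this tutorial):
```python
for epoch in range(num_epochs):
    train_one_epoch(my_net, fmnist_train_loader, loss_function, optimizer)  # hypothetical helper
    acc = evaluate(my_net, fmnist_test_loader)  # hypothetical helper

    # report the intermediate accuracy so the pruner can act on it
    trial.report(acc, epoch)

    # raise a special exception to let Optuna stop an unpromising trial early
    if trial.should_prune():
        raise optuna.TrialPruned()
```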
More information on pruning with Optuna can be found [here](https://optuna.readthedocs.io/en/v2.0.0/tutorial/pruning.html).

We can also track and log our best parameters and best evaluation metric in MLflow by wrapping our main code in an MLflow run:
```python
with mlflow.start_run():
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10)

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_acc", study.best_value)
```
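Once the study finishes, the logged parameters and metric can be browsed in the MLflow tracking UI, which (assuming a default local tracking setup) is started with:
```bash
mlflow ui
```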
Note that one could also create an MLflow run for each individual trial by wrapping the body of the `objective` function in an MLflow run as follows:
```python
def objective(trial):

    # start an MLflow run for this trial
    with mlflow.start_run():

        input_size = 784
        hidden_size1 = trial.suggest_int('hidden_size1', 100, 300)
        hidden_size2 = trial.suggest_int('hidden_size2', 100, 300)
        output_size = 10
        num_epochs = 4
        batch_size = 100
        lr = 0.01

        my_net = SeqNet(input_size, hidden_size1, hidden_size2, output_size)
        my_net = my_net.to(device)

        optimizer = torch.optim.Adam(my_net.parameters(), lr=lr)
        loss_function = nn.CrossEntropyLoss()

        fmnist_train = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())
        fmnist_test = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())

        fmnist_train_loader = DataLoader(fmnist_train, batch_size=batch_size, shuffle=True)
        fmnist_test_loader = DataLoader(fmnist_test, batch_size=batch_size, shuffle=True)

        train(my_net, fmnist_train_loader, loss_function, optimizer, num_epochs)

        # log params and model in the current MLflow run
        mlflow.log_params({"epochs": num_epochs, "lr": lr})
        mlflow.pytorch.log_model(my_net, "model")

        # evaluate accuracy on the test set
        correct = 0
        total = 0
        for images, labels in fmnist_test_loader:
            images = torch.div(images, 255.)
            images = images.to(device)
            labels = labels.to(device)
            output = my_net(images)
            _, predicted = torch.max(output, 1)
            correct += (predicted == labels).sum()
            total += labels.size(0)

        acc = (100 * correct) / total
        return acc
```
If this per-trial pattern is combined with the outer `mlflow.start_run()` shown above, pass `nested=True` to the inner call (`mlflow.start_run(nested=True)`) so that each trial is recorded as a child run; otherwise MLflow raises an error because a run is already active.
{: .note }
Download the full script used in this example [here](https://github.com/accre/mltf/blob/main/docs/modelScripts/hpo_pytorch_singlegpu.py).