From d2e237107534d256289c326496a6c46d9b9e85ea Mon Sep 17 00:00:00 2001
From: abhijeet-dhumal
Date: Mon, 16 Dec 2024 17:36:15 +0530
Subject: [PATCH] Add MNIST test to run multi-node distributed training using
 KFTO

---
 tests/kfto/kfto_mnist_training_test.go     | 270 ++++++++++++++++++++
 tests/kfto/resources/mnist.py              | 224 +++++++++++++++
 tests/kfto/resources/requirements-rocm.txt |   5 +
 tests/kfto/resources/requirements.txt      |   4 +
 4 files changed, 503 insertions(+)
 create mode 100644 tests/kfto/kfto_mnist_training_test.go
 create mode 100644 tests/kfto/resources/mnist.py
 create mode 100644 tests/kfto/resources/requirements-rocm.txt
 create mode 100644 tests/kfto/resources/requirements.txt

diff --git a/tests/kfto/kfto_mnist_training_test.go b/tests/kfto/kfto_mnist_training_test.go
new file mode 100644
index 00000000..9a7d4980
--- /dev/null
+++ b/tests/kfto/kfto_mnist_training_test.go
@@ -0,0 +1,270 @@
+/*
+Copyright 2023.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kfto
+
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"testing"
+
+	. "github.com/onsi/gomega"
+	. "github.com/project-codeflare/codeflare-common/support"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
+)
+
+func TestPyTorchJobMnistCpu(t *testing.T) {
+	runKFTOPyTorchMnistJob(t, 0, "", GetCudaTrainingImage(), "resources/requirements.txt")
+}
+
+func TestPyTorchJobMnistWithCuda(t *testing.T) {
+	runKFTOPyTorchMnistJob(t, 1, "nvidia.com/gpu", GetCudaTrainingImage(), "resources/requirements.txt")
+}
+
+func TestPyTorchJobMnistWithROCm(t *testing.T) {
+	runKFTOPyTorchMnistJob(t, 1, "amd.com/gpu", GetROCmTrainingImage(), "resources/requirements-rocm.txt")
+}
+
+func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, gpuLabel string, image string, requirementsFile string) {
+	test := With(t)
+
+	// Create a namespace
+	namespace := test.NewTestNamespace()
+
+	workingDirectory, err := os.Getwd()
+	test.Expect(err).ToNot(HaveOccurred())
+
+	mnist, err := os.ReadFile(workingDirectory + "/resources/mnist.py")
+	test.Expect(err).ToNot(HaveOccurred())
+
+	requirements, err := os.ReadFile(workingDirectory + "/" + requirementsFile)
+	test.Expect(err).ToNot(HaveOccurred())
+	if numGpus > 0 {
+		mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"gpu\""), 1)
+	} else {
+		mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"cpu\""), 1)
+	}
+	config := CreateConfigMap(test, namespace.Name, map[string][]byte{
+		// MNIST training script and its pip requirements
+		"mnist.py":         mnist,
+		"requirements.txt": requirements,
+	})
+
+	// Create PVC for trained model
+	outputPvc := CreatePersistentVolumeClaim(test, namespace.Name, "50Gi", corev1.ReadWriteMany)
+	defer test.Client().Core().CoreV1().PersistentVolumeClaims(namespace.Name).Delete(test.Ctx(), outputPvc.Name, metav1.DeleteOptions{})
+
+	// Create training PyTorch job
+	tuningJob := createKFTOPyTorchMnistJob(test, namespace.Name, *config, gpuLabel, numGpus, outputPvc.Name, image)
+	defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace.Name).Delete(test.Ctx(), tuningJob.Name, *metav1.NewDeleteOptions(0))
+
+	// Make sure the PyTorch job is running
+	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
+		Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
+
+	// Make sure the PyTorch job succeeded
+	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
+	test.T().Logf("PyTorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
+
+}
+
+func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.ConfigMap, gpuLabel string, numGpus int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
+
+	var useGPU = false
+	var backend = ""
+
+	if gpuLabel == "nvidia.com/gpu" && numGpus > 0 {
+		useGPU = true
+		backend = "nccl"
+	}
+	if gpuLabel == "amd.com/gpu" && numGpus > 0 {
+		useGPU = true
+		backend = "mpi"
+	}
+	if backend == "" {
+		backend = "gloo"
+	}
+
+	tuningJob := &kftov1.PyTorchJob{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: corev1.SchemeGroupVersion.String(),
+			Kind:       "PyTorchJob",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			GenerateName: "kfto-mnist-",
+		},
+		Spec: kftov1.PyTorchJobSpec{
+			PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
+				"Master": {
+					Replicas:      Ptr(int32(1)),
+					RestartPolicy: kftov1.RestartPolicyOnFailure,
+					Template: corev1.PodTemplateSpec{
+						Spec: corev1.PodSpec{
+							Containers: []corev1.Container{
+								{
+									Name:            "pytorch",
+									Image:           baseImage,
+									ImagePullPolicy: corev1.PullIfNotPresent,
+									Command: []string{
+										"/bin/bash", "-c",
+										fmt.Sprintf(`python -m venv /tmp/venv && \
+										source /tmp/venv/bin/activate && \
+										pip install --no-cache-dir -r /mnt/files/requirements.txt && \
+										python /mnt/files/mnist.py --epochs 1 --save-model --backend %s`, backend),
+									},
+									Env: []corev1.EnvVar{
+										{
+											Name:  "MASTER_ADDR",
+											Value: "mnist-pytorch-distributed-master-0.mnist-pytorch-distributed-master",
+										},
+										{
+											Name:  "MASTER_PORT",
+											Value: "29500",
+										},
+										{
+											Name:  "WORLD_SIZE",
+											Value: "2",
+										},
+									},
+									VolumeMounts: []corev1.VolumeMount{
+										{
+											Name:      config.Name,
+											MountPath: "/mnt/files",
+											ReadOnly:  true,
+										},
+									},
+								},
+							},
+							Volumes: []corev1.Volume{
+								{
+									Name: config.Name,
+									VolumeSource: corev1.VolumeSource{
+										ConfigMap: &corev1.ConfigMapVolumeSource{
+											LocalObjectReference: corev1.LocalObjectReference{
+												Name: config.Name,
+											},
+										},
+									},
+								},
+							},
+							RestartPolicy: corev1.RestartPolicyOnFailure,
+						},
+					},
+				},
+				"Worker": {
+					Replicas:      Ptr(int32(1)),
+					RestartPolicy: kftov1.RestartPolicyOnFailure,
+					Template: corev1.PodTemplateSpec{
+						Spec: corev1.PodSpec{
+							Containers: []corev1.Container{
+								{
+									Name:            "pytorch",
+									Image:           baseImage,
+									ImagePullPolicy: corev1.PullIfNotPresent,
+									Command: []string{
+										"/bin/bash", "-c",
+										fmt.Sprintf(`python -m venv /tmp/venv && \
+										source /tmp/venv/bin/activate && \
+										pip install --no-cache-dir -r /mnt/files/requirements.txt && \
+										python /mnt/files/mnist.py --epochs 1 --save-model --backend %s`, backend),
+									},
+									Env: []corev1.EnvVar{
+										{
+											Name:  "MASTER_ADDR",
+											Value: "mnist-pytorch-distributed-master-0.mnist-pytorch-distributed-master",
+										},
+										{
+											Name:  "MASTER_PORT",
+											Value: "29500",
+										},
+										{
+											Name:  "WORLD_SIZE",
+											Value: "2",
+										},
+									},
+									VolumeMounts: []corev1.VolumeMount{
+										{
+											Name:      config.Name,
+											MountPath: "/mnt/files",
+											ReadOnly:  true,
+										},
+									},
+								},
+							},
+							Volumes: []corev1.Volume{
+								{
+									Name: config.Name,
+									VolumeSource: corev1.VolumeSource{
+										ConfigMap: &corev1.ConfigMapVolumeSource{
+											LocalObjectReference: corev1.LocalObjectReference{
+												Name: config.Name,
+											},
+										},
+									},
+								},
+							},
+							RestartPolicy: corev1.RestartPolicyOnFailure,
+						},
+					},
+				},
+			},
+		},
+	}
+
+	if useGPU {
+		// Update resource lists
+		tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{
+			Limits: corev1.ResourceList{
+				corev1.ResourceCPU:            resource.MustParse("2"),
+				corev1.ResourceMemory:         resource.MustParse("8Gi"),
+				corev1.ResourceName(gpuLabel): resource.MustParse(fmt.Sprint(numGpus)),
+			},
+		}
+		tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{
+			Limits: corev1.ResourceList{
+				corev1.ResourceCPU:            resource.MustParse("2"),
+				corev1.ResourceMemory:         resource.MustParse("8Gi"),
+				corev1.ResourceName(gpuLabel): resource.MustParse(fmt.Sprint(numGpus)),
+			},
+		}
+
+		// Update tolerations
+		tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Tolerations = []corev1.Toleration{
+			{
+				Key:      gpuLabel,
+				Operator: corev1.TolerationOpExists,
+			},
+		}
+		tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Tolerations = []corev1.Toleration{
+			{
+				Key:      gpuLabel,
+				Operator: corev1.TolerationOpExists,
+			},
+		}
+	}
+
+	tuningJob, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{})
+	test.Expect(err).NotTo(HaveOccurred())
+	test.T().Logf("Created PyTorchJob %s/%s successfully", tuningJob.Namespace, tuningJob.Name)
+
+	return tuningJob
+}
diff --git a/tests/kfto/resources/mnist.py b/tests/kfto/resources/mnist.py
new file mode 100644
index 00000000..4ccd0519
--- /dev/null
+++ b/tests/kfto/resources/mnist.py
@@ -0,0 +1,224 @@
+from __future__ import print_function
+
+import argparse
+import os
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from tensorboardX import SummaryWriter
+from torch.utils.data import DistributedSampler
+from torchvision import datasets, transforms
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 20, 5, 1)
+        self.conv2 = nn.Conv2d(20, 50, 5, 1)
+        self.fc1 = nn.Linear(4 * 4 * 50, 500)
+        self.fc2 = nn.Linear(500, 10)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = x.view(-1, 4 * 4 * 50)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return F.log_softmax(x, dim=1)
+
+
+def train(args, model, device, train_loader, epoch, writer):
+    model.train()
+    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
+
+    for batch_idx, (data, target) in enumerate(train_loader):
+        # Attach tensors to the device.
+        data, target = data.to(device), target.to(device)
+
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args.log_interval == 0:
+            print(
+                "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
+                    epoch,
+                    batch_idx * len(data),
+                    len(train_loader.dataset),
+                    100.0 * batch_idx / len(train_loader),
+                    loss.item(),
+                )
+            )
+            niter = epoch * len(train_loader) + batch_idx
+            writer.add_scalar("loss", loss.item(), niter)
+
+
+def test(model, device, test_loader, writer, epoch):
+    model.eval()
+
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            # Attach tensors to the device.
+            data, target = data.to(device), target.to(device)
+
+            output = model(data)
+            # Get the index of the max log-probability.
+            pred = output.max(1, keepdim=True)[1]
+            correct += pred.eq(target.view_as(pred)).sum().item()
+
+    print("\naccuracy={:.4f}\n".format(float(correct) / len(test_loader.dataset)))
+    writer.add_scalar("accuracy", float(correct) / len(test_loader.dataset), epoch)
+
+
+def main():
+    # Training settings
+    parser = argparse.ArgumentParser(description="PyTorch FashionMNIST Example")
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=64,
+        metavar="N",
+        help="input batch size for training (default: 64)",
+    )
+    parser.add_argument(
+        "--test-batch-size",
+        type=int,
+        default=1000,
+        metavar="N",
+        help="input batch size for testing (default: 1000)",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=1,
+        metavar="N",
+        help="number of epochs to train (default: 1)",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=0.01,
+        metavar="LR",
+        help="learning rate (default: 0.01)",
+    )
+    parser.add_argument(
+        "--momentum",
+        type=float,
+        default=0.5,
+        metavar="M",
+        help="SGD momentum (default: 0.5)",
+    )
+    parser.add_argument(
+        "--no-cuda",
+        action="store_true",
+        default=False,
+        help="disables CUDA training",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        metavar="S",
+        help="random seed (default: 1)",
+    )
+    parser.add_argument(
+        "--log-interval",
+        type=int,
+        default=10,
+        metavar="N",
+        help="how many batches to wait before logging training status",
+    )
+    parser.add_argument(
+        "--save-model",
+        action="store_true",
+        default=False,
+        help="for saving the current model",
+    )
+    parser.add_argument(
+        "--dir",
+        default="logs",
+        metavar="L",
+        help="directory where summary logs are stored",
+    )
+
+    parser.add_argument(
+        "--backend",
+        type=str,
+        help="Distributed backend",
+        choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI],
+        default=dist.Backend.GLOO,
+    )
+
+    args = parser.parse_args()
+    use_cuda = not args.no_cuda and torch.cuda.is_available()
+    if use_cuda:
+        print("Using CUDA")
+        if args.backend != dist.Backend.NCCL:
+            print(
+                "Warning: please use the `nccl` distributed backend for best performance on GPUs"
+            )
+
+    writer = SummaryWriter(args.dir)
+
+    torch.manual_seed(args.seed)
+
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    # Attach model to the device.
+    model = Net().to(device)
+
+    print("Using distributed PyTorch with {} backend".format(args.backend))
+    # Set distributed training environment variables to run this training script locally.
+    if "WORLD_SIZE" not in os.environ:
+        os.environ["RANK"] = "0"
+        os.environ["WORLD_SIZE"] = "1"
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "1234"
+
+    print(f"World Size: {os.environ['WORLD_SIZE']}. Rank: {os.environ['RANK']}")
+
+    dist.init_process_group(backend=args.backend)
+    model = nn.parallel.DistributedDataParallel(model)
+
+    # Get FashionMNIST train and test dataset.
+    train_ds = datasets.FashionMNIST(
+        "../data",
+        train=True,
+        download=True,
+        transform=transforms.Compose([transforms.ToTensor()]),
+    )
+    test_ds = datasets.FashionMNIST(
+        "../data",
+        train=False,
+        download=True,
+        transform=transforms.Compose([transforms.ToTensor()]),
+    )
+    # Add train and test loaders.
+    train_loader = torch.utils.data.DataLoader(
+        train_ds,
+        batch_size=args.batch_size,
+        sampler=DistributedSampler(train_ds),
+    )
+    test_loader = torch.utils.data.DataLoader(
+        test_ds,
+        batch_size=args.test_batch_size,
+        sampler=DistributedSampler(test_ds),
+    )
+
+    for epoch in range(1, args.epochs + 1):
+        train(args, model, device, train_loader, epoch, writer)
+        test(model, device, test_loader, writer, epoch)
+
+    if args.save_model:
+        torch.save(model.state_dict(), "mnist_cnn.pt")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/kfto/resources/requirements-rocm.txt b/tests/kfto/resources/requirements-rocm.txt
new file mode 100644
index 00000000..4165c8c7
--- /dev/null
+++ b/tests/kfto/resources/requirements-rocm.txt
@@ -0,0 +1,5 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.1
+torch==2.4.0+rocm6.1
+torchvision==0.19.0
+tensorboardx==2.6.2
+minio==7.2.12
diff --git a/tests/kfto/resources/requirements.txt b/tests/kfto/resources/requirements.txt
new file mode 100644
index 00000000..9ccaad26
--- /dev/null
+++ b/tests/kfto/resources/requirements.txt
@@ -0,0 +1,4 @@
+torch==2.4.0
+torchvision==0.19.0
+tensorboardx==2.6.2
+minio==7.2.12
\ No newline at end of file