From 03a6d4effb9223670f439c3a29198ef34938922f Mon Sep 17 00:00:00 2001
From: Dong Yang
Date: Tue, 24 Oct 2023 02:25:41 -0600
Subject: [PATCH] [auto3dseg] enable mlflow in algorithms (#319)

Enable mlflow for metric tracking in algorithms.

---------

Signed-off-by: dongy
---
 .../algorithm_templates/dints/scripts/search.py      | 12 ++++++++++++
 auto3dseg/algorithm_templates/dints/scripts/train.py | 11 +++++++++++
 2 files changed, 23 insertions(+)

diff --git a/auto3dseg/algorithm_templates/dints/scripts/search.py b/auto3dseg/algorithm_templates/dints/scripts/search.py
index 5ce45184..55f9edb5 100644
--- a/auto3dseg/algorithm_templates/dints/scripts/search.py
+++ b/auto3dseg/algorithm_templates/dints/scripts/search.py
@@ -11,6 +11,8 @@
 import logging
 import math
+import mlflow
+import mlflow.pytorch
 import os
 import random
 import sys
@@ -291,6 +293,9 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):

     if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
         writer = SummaryWriter(log_dir=os.path.join(arch_path, "Events"))
+        mlflow.set_tracking_uri(os.path.join(ckpt_path, "mlruns"))
+
+        mlflow.start_run(run_name=f'dints - fold{fold} - search')

         with open(os.path.join(arch_path, "accuracy_history.csv"), "a") as f:
             f.write("epoch\tmetric\tloss\tlr\ttime\titer\n")
@@ -355,6 +360,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
             if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
                 logger.debug(f"[{str(datetime.now())[:19]}] " + f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
                 writer.add_scalar("Loss/train", loss.item(), epoch_len * epoch + step)
+                mlflow.log_metric('Loss/train', loss.item(), step=epoch_len * epoch + step)

         if epoch < num_epochs_warmup:
             continue
@@ -437,6 +443,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                     f"[{str(datetime.now())[:19]}] " + f"{step}/{epoch_len}, train_loss_arch: {loss.item():.4f}"
                 )
                 writer.add_scalar("train_loss_arch", loss.item(), epoch_len * epoch + step)
+                mlflow.log_metric('train_loss_arch', loss.item(), step=epoch_len * epoch + step)

         lr_scheduler.step()
@@ -544,6 +551,9 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
             avg_metric = avg_metric / float(metric_dim)
             logger.debug(f"avg_metric, {avg_metric}")
+            writer.add_scalar("val/acc", avg_metric, epoch)
+            mlflow.log_metric("val/acc", avg_metric, step=epoch)
+
             if avg_metric > best_metric:
                 best_metric = avg_metric
                 best_metric_epoch = epoch + 1
@@ -615,6 +625,8 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
         writer.flush()
         writer.close()

+        mlflow.end_run()
+
     if torch.cuda.device_count() > 1:
         dist.destroy_process_group()

diff --git a/auto3dseg/algorithm_templates/dints/scripts/train.py b/auto3dseg/algorithm_templates/dints/scripts/train.py
index ef7fa4d8..e02160e3 100644
--- a/auto3dseg/algorithm_templates/dints/scripts/train.py
+++ b/auto3dseg/algorithm_templates/dints/scripts/train.py
@@ -17,6 +17,8 @@
 import io
 import logging
 import math
+import mlflow
+import mlflow.pytorch
 import os
 import random
 import sys
@@ -526,6 +528,9 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):

     if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
         writer = SummaryWriter(log_dir=os.path.join(ckpt_path, "Events"))
+        mlflow.set_tracking_uri(os.path.join(ckpt_path, "mlruns"))
+
+        mlflow.start_run(run_name=f'dints - fold{fold} - train')

         with open(os.path.join(ckpt_path, "accuracy_history.csv"), "a") as f:
             f.write("epoch\tmetric\tloss\tlr\ttime\titer\n")
@@ -618,6 +623,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                         f"[{str(datetime.now())[:19]}] " + f"{step}/{epoch_len}, train_loss: {loss.item():.4f}"
                     )
                     writer.add_scalar("train/loss", loss.item(), epoch_len * _round + step)
+                    mlflow.log_metric('train/loss', loss.item(), step=epoch_len * _round + step)

             lr_scheduler.step()
@@ -760,8 +766,10 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
                         writer.add_scalar(
                             f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1], epoch
                         )
+                        mlflow.log_metric(f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1], step=epoch)
                     except BaseException:
                         writer.add_scalar(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1], epoch)
+                        mlflow.log_metric(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1], step=epoch)

                 avg_metric = 0
                 for _c in range(metric_dim):
@@ -770,6 +778,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):

                 logger.debug(f"avg_metric: {avg_metric}")
                 writer.add_scalar("val/acc", avg_metric, epoch)
+                mlflow.log_metric("val/acc", avg_metric, step=epoch)

                 if torch.cuda.device_count() > 1:
                     torch.save(model.module.state_dict(), os.path.join(ckpt_path, "current_model.pt"))
@@ -965,6 +974,8 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
         writer.flush()
         writer.close()

+        mlflow.end_run()
+
     if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
         if (not valid_at_orig_resolution_only) and es and (_round + 1) < num_rounds:
             logger.warning(f"{os.path.basename(bundle_root)} - training: finished with early stop")