Skip to content

Commit

Permalink
[auto3dseg] enable mlflow in algorithms (#319)
Browse files (browse the repository at this point in the history)
Enable MLflow metric tracking (training loss and validation accuracy) in the Auto3DSeg DiNTS algorithm template scripts (search.py and train.py).

---------

Signed-off-by: dongy <[email protected]>
  • Loading branch information
dongyang0122 authored Oct 24, 2023
1 parent b2809b5 commit 03a6d4e
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 0 deletions.
12 changes: 12 additions & 0 deletions auto3dseg/algorithm_templates/dints/scripts/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

import logging
import math
import mlflow
import mlflow.pytorch
import os
import random
import sys
Expand Down Expand Up @@ -291,6 +293,9 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):

if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
writer = SummaryWriter(log_dir=os.path.join(arch_path, "Events"))
mlflow.set_tracking_uri(os.path.join(ckpt_path, "mlruns"))

mlflow.start_run(run_name=f'dints - fold{fold} - search')

with open(os.path.join(arch_path, "accuracy_history.csv"), "a") as f:
f.write("epoch\tmetric\tloss\tlr\ttime\titer\n")
Expand Down Expand Up @@ -355,6 +360,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
logger.debug(f"[{str(datetime.now())[:19]}] " + f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
writer.add_scalar("Loss/train", loss.item(), epoch_len * epoch + step)
mlflow.log_metric('Loss/train', loss.item(), step=epoch_len * epoch + step)

if epoch < num_epochs_warmup:
continue
Expand Down Expand Up @@ -437,6 +443,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
f"[{str(datetime.now())[:19]}] " + f"{step}/{epoch_len}, train_loss_arch: {loss.item():.4f}"
)
writer.add_scalar("train_loss_arch", loss.item(), epoch_len * epoch + step)
mlflow.log_metric('train_loss_arch', loss.item(), step=epoch_len * epoch + step)

lr_scheduler.step()

Expand Down Expand Up @@ -544,6 +551,9 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
avg_metric = avg_metric / float(metric_dim)
logger.debug(f"avg_metric, {avg_metric}")

writer.add_scalar("val/acc", avg_metric, epoch)
mlflow.log_metric("val/acc", avg_metric, step=epoch)

if avg_metric > best_metric:
best_metric = avg_metric
best_metric_epoch = epoch + 1
Expand Down Expand Up @@ -615,6 +625,8 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
writer.flush()
writer.close()

mlflow.end_run()

if torch.cuda.device_count() > 1:
dist.destroy_process_group()

Expand Down
11 changes: 11 additions & 0 deletions auto3dseg/algorithm_templates/dints/scripts/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import io
import logging
import math
import mlflow
import mlflow.pytorch
import os
import random
import sys
Expand Down Expand Up @@ -526,6 +528,9 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):

if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
writer = SummaryWriter(log_dir=os.path.join(ckpt_path, "Events"))
mlflow.set_tracking_uri(os.path.join(ckpt_path, "mlruns"))

mlflow.start_run(run_name=f'dints - fold{fold} - train')

with open(os.path.join(ckpt_path, "accuracy_history.csv"), "a") as f:
f.write("epoch\tmetric\tloss\tlr\ttime\titer\n")
Expand Down Expand Up @@ -618,6 +623,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
f"[{str(datetime.now())[:19]}] " + f"{step}/{epoch_len}, train_loss: {loss.item():.4f}"
)
writer.add_scalar("train/loss", loss.item(), epoch_len * _round + step)
mlflow.log_metric('train/loss', loss.item(), step=epoch_len * _round + step)

lr_scheduler.step()

Expand Down Expand Up @@ -760,8 +766,10 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
writer.add_scalar(
f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1], epoch
)
mlflow.log_metric(f"val_class/acc_{class_names[_c]}", metric[2 * _c] / metric[2 * _c + 1], step=epoch)
except BaseException:
writer.add_scalar(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1], epoch)
mlflow.log_metric(f"val_class/acc_{_c}", metric[2 * _c] / metric[2 * _c + 1], step=epoch)

avg_metric = 0
for _c in range(metric_dim):
Expand All @@ -770,6 +778,7 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
logger.debug(f"avg_metric: {avg_metric}")

writer.add_scalar("val/acc", avg_metric, epoch)
mlflow.log_metric("val/acc", avg_metric, step=epoch)

if torch.cuda.device_count() > 1:
torch.save(model.module.state_dict(), os.path.join(ckpt_path, "current_model.pt"))
Expand Down Expand Up @@ -965,6 +974,8 @@ def run(config_file: Optional[Union[str, Sequence[str]]] = None, **override):
writer.flush()
writer.close()

mlflow.end_run()

if torch.cuda.device_count() == 1 or dist.get_rank() == 0:
if (not valid_at_orig_resolution_only) and es and (_round + 1) < num_rounds:
logger.warning(f"{os.path.basename(bundle_root)} - training: finished with early stop")
Expand Down

0 comments on commit 03a6d4e

Please sign in to comment.