diff --git a/.gitignore b/.gitignore index 409ae1c..5a75546 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ isaac_ws/datasets/** model/* bag_ws/* +training_ws/runs diff --git a/README.md b/README.md index 23f2463..a5c2402 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,17 @@ After the training ends, a `model.pth` file will be available inside `model`. Ad docker compose -f docker/docker-compose.yml --profile training_test up ``` -This will evaluate every image in the `DATASET_NAME` and generate annotated images in the `model` folder. +This will evaluate every image in the `DATASET_NAME` and generate annotated images in the `model/test_output` folder. + +### Training logs visualization + +The logs generated when training a model are stored in the `model/runs` folder and can be visualized using the `training_vis` profile. This profile serves TensorBoard at `localhost:6006`, which can be opened in a web browser. To run the TensorBoard server, execute: + +```bash +docker compose -f docker/docker-compose.yml --profile training_vis up +``` + +![TensorBoard dashboard](./doc/tensorboard.png) ## Run diff --git a/doc/tensorboard.png b/doc/tensorboard.png new file mode 100644 index 0000000..d30af72 Binary files /dev/null and b/doc/tensorboard.png differ diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 0c20023..4539bca 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -17,6 +17,7 @@ services: - ../isaac_ws/datasets/${DATASET_NAME}:/root/training_ws/data - ../model/:/root/training_ws/model - torch_cache:/root/.cache/torch + - ../model/runs/:/root/training_ws/runs deploy: resources: reservations: @@ -39,7 +40,7 @@ services: "-d", "./data", "-o", - "./model"] + "./model/test_output"] volumes: - ../isaac_ws/datasets/${DATASET_NAME}:/root/training_ws/data - ../model/:/root/training_ws/model @@ -52,6 +53,17 @@ services: count: 1 capabilities: - gpu + training_log_visualization: + build: + context: .. 
+ dockerfile: docker/training.dockerfile + target: training_prod + container_name: training_vis + profiles: ["training_vis"] + entrypoint: ["tensorboard", "--logdir=/root/training_ws/runs/"] + volumes: + - ../model/runs:/root/training_ws/runs + network_mode: host detection: build: context: .. diff --git a/training_ws/eval.py b/training_ws/eval.py index 3e42e3a..f08aaab 100644 --- a/training_ws/eval.py +++ b/training_ws/eval.py @@ -212,6 +212,8 @@ def decode_output(output, labels_list, threshold=0.05): def main(): """Run inference on dataset and store images with bounding boxes.""" options, args = parse_input() + if not os.path.exists(options.output_folder): + os.makedirs(options.output_folder) dataset = FruitDataset(options.data_dir, get_transform()) validloader = torch.utils.data.DataLoader( diff --git a/training_ws/requirements.txt b/training_ws/requirements.txt index 447ae34..1361138 100644 --- a/training_ws/requirements.txt +++ b/training_ws/requirements.txt @@ -1,3 +1,3 @@ torch==2.4.0 torchvision==0.19.0 -tensorboard==2.17 +tensorboard==2.17.1 diff --git a/training_ws/train.py b/training_ws/train.py index d5544cf..f3a9f6d 100644 --- a/training_ws/train.py +++ b/training_ws/train.py @@ -230,15 +230,18 @@ def main(): options, args = parse_input() dataset = FruitDataset(options.data_dir, get_transform(train=True)) train_size = int(len(dataset) * TRAINING_PARTITION_RATIO) - unused_size = len(dataset) - train_size + valid_size = len(dataset) - train_size - train, unused = torch.utils.data.random_split( + train, valid = torch.utils.data.random_split( dataset, - [train_size, unused_size], + [train_size, valid_size], ) train_loader = torch.utils.data.DataLoader( train, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn ) + valid_loader = torch.utils.data.DataLoader( + valid, batch_size=1, shuffle=True, num_workers=0, collate_fn=collate_fn + ) device = None if torch.cuda.is_available(): @@ -268,13 +271,26 @@ def main(): ] # Format the annotations for 
model consumption loss_dict = model(imgs, annotations) losses = sum(loss for loss in loss_dict.values()) - writer.add_scalar("Loss/train", losses, epoch) + writer.add_scalar("Loss/Train", losses, epoch) losses.backward() optimizer.step() print(f"Iteration: {i}/{len_dataloader}, Loss: {losses}") + loss_sum = 0 + for imgs, annotations in valid_loader: + imgs = list(img.to(device) for img in imgs) + annotations = [ + {k: v.to(device) for k, v in t.items()} for t in annotations + ] # Format the annotations for model consumption + loss_dict = model(imgs, annotations) + losses = sum(loss for loss in loss_dict.values()) + loss_sum = loss_sum + losses.item() + avg_loss = loss_sum / len(valid_loader) + print(f"Epoch: {epoch}, Validation loss (average): {avg_loss}") + writer.add_scalar("Loss/Validation", avg_loss, epoch) + writer.close() torch.save(model.state_dict(), options.output_file) print(