From 85c696c08b7dbf436749e982346592557729ef25 Mon Sep 17 00:00:00 2001
From: Etienne Dupont
Date: Thu, 6 Jan 2022 10:21:34 +0100
Subject: [PATCH] Add benchmark and training scripts

---
 requirements.txt  |   1 +
 speed_test_hub.py | 103 ++++++++++++++++++++++++++++++++
 train_hub.py      | 154 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 258 insertions(+)
 create mode 100644 speed_test_hub.py
 create mode 100644 train_hub.py

diff --git a/requirements.txt b/requirements.txt
index a2c3ed9..b54ac53 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,3 +23,4 @@ tomli==1.2.1
 torch>=1.9.0
 torchvision>=0.10.0
 typing-extensions==3.10.0.2
+hub==2.1.1
\ No newline at end of file
diff --git a/speed_test_hub.py b/speed_test_hub.py
new file mode 100644
index 0000000..2da462c
--- /dev/null
+++ b/speed_test_hub.py
@@ -0,0 +1,103 @@
+import time
+import argparse
+
+import hub
+
+from torch.utils.data.dataloader import DataLoader
+from torchvision.transforms import functional as F
+
+from dataset import get_coco_dataset
+from train_hub import collate_fn, transform_hub_item
+
+# ### Global Variables ###
+# ## Data Fetching ##
+BATCH_SIZE = 5
+NUM_WORKERS = 1
+# ## Training ##
+RATIO_TRAINING_SPLIT = 1
+max_count = 50
+
+
+def test_local(dataset_path: str):
+    train_dataset = get_coco_dataset(dataset_path, train=True)
+    dataloader = DataLoader(
+        train_dataset,
+        batch_size=BATCH_SIZE,
+        num_workers=NUM_WORKERS,
+        shuffle=False,
+        collate_fn=collate_fn,
+    )
+    count = 0
+    before = time.time()
+    for item in dataloader:
+        if count % 10 == 0:
+            print("item #", count)
+        count += 1
+        if count >= max_count:
+            break
+    after = time.time()
+    print(
+        """
+        Local dataset loading ran at {} ms/image
+        with config:
+        - number of workers = {}
+        - batch size = {}
+        """.format(
+            int(1000 * (after - before) / (max_count * BATCH_SIZE)),
+            NUM_WORKERS,
+            BATCH_SIZE,
+        )
+    )
+
+
+def test_hub(dataset_path: str):
+    ds = hub.load(
+        dataset_path,
+        read_only=True,
+    )
+    dataloader = ds.pytorch(
+        batch_size=BATCH_SIZE,
+        transform=transform_hub_item,
+        collate_fn=collate_fn,
+        num_workers=NUM_WORKERS,
+    )
+    count = 0
+    before = time.time()
+    for item in dataloader:
+        if count % 10 == 0:
+            print("item #", count)
+        count += 1
+        if count >= max_count:
+            break
+    after = time.time()
+    print(
+        """
+        Hub dataset loading ran at {} ms/image
+        with config:
+        - number of workers = {}
+        - batch size = {}
+        """.format(
+            int(1000 * (after - before) / (max_count * BATCH_SIZE)),
+            NUM_WORKERS,
+            BATCH_SIZE,
+        )
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Compare local dataset loading to remote hub dataset loading")
+    parser.add_argument(
+        "--local-path",
+        dest="local_path",
+        help="path to your local coco dataset directory",
+        required=True,
+    )
+    parser.add_argument(
+        "--hub-path",
+        dest="hub_path",
+        help="path to the same dataset as local-path but hosted on hub, e.g. s3://my-bucket/my-dataset",
+        required=True,
+    )
+    args = parser.parse_args()
+    test_local(args.local_path)
+    test_hub(args.hub_path)
diff --git a/train_hub.py b/train_hub.py
new file mode 100644
index 0000000..184c844
--- /dev/null
+++ b/train_hub.py
@@ -0,0 +1,154 @@
+import hub
+import os
+import argparse
+
+import torch
+from torch.optim import SGD, lr_scheduler
+from torchvision.transforms import functional as F
+
+from model import get_fasterrcnn_resnet50_fpn
+from engine import train_one_epoch, evaluate
+
+# ### Global Variables ###
+# ## Model ##
+TRAINABLE_BACKBONE_LAYERS = 3
+# ## Data Fetching ##
+BATCH_SIZE = 2
+NUM_WORKERS = 2
+# ## Optimization ##
+LEARNING_RATE = 0.005 * BATCH_SIZE / 4  # Apply linear scaling rule
+MOMENTUM = 0.9
+WEIGHT_DECAY = 0.0005
+STEP_SIZE = 3
+GAMMA = 0.1
+# ## Training ##
+NUM_EPOCHS = 10
+RATIO_TRAINING_SPLIT = 0.8
+# ## Logging ##
+PRINT_FREQUENCY = 10
+
+
+def transform_hub_item(hub_item):
+    """Transform a sample from the Hub dataset into a model input.
+
+    Args:
+        hub_item: Hub sample exposing "images", "boxes" and "labels" tensors.
+
+    Returns:
+        tuple: (image tensor, target dict with "labels" and "boxes").
+    """
+    image = F.to_tensor(hub_item.get("images"))
+    boxes = torch.tensor(hub_item.get("boxes"))
+    # Convert hub boxes from [x, y, w, h] to [x1, y1, x2, y2]
+    boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
+    boxes[:, 3] = boxes[:, 1] + boxes[:, 3]
+    return (
+        image,
+        {
+            "labels": torch.tensor(hub_item.get("labels")),
+            "boxes": boxes,
+        },
+    )
+
+
+def collate_fn(batch):
+    return tuple(zip(*batch))
+
+
+def train(training_dataset_path: str, validation_dataset_path: str) -> None:
+    """Train an object detection model on the given hub dataset.
+
+    A snapshot of the model is stored after each epoch, containing the
+    model's weights and the mapping between the model's outputs and the
+    categories. Those snapshots can directly serve as input to the
+    "detect.py" script.
+
+    Args:
+        training_dataset_path (str): path to the hub training dataset directory.
+        validation_dataset_path (str): path to the hub validation dataset directory.
+
+    Returns:
+        None
+    """
+    # FIXME: the following line is needed on macOS to avoid
+    # https://stackoverflow.com/questions/64772335/pytorch-w-parallelnative-cpp206
+    # but it also significantly slows down data loading (1 thread instead of NUM_WORKERS).
+    os.environ["OMP_NUM_THREADS"] = "1"
+    training_dataset_hub = hub.load(
+        training_dataset_path,
+        read_only=True,
+    )
+    validation_dataset_hub = hub.load(
+        validation_dataset_path,
+        read_only=True,
+    )
+    model_output_path = os.path.join(
+        "outputs", "models", os.path.basename(training_dataset_path)
+    )
+    os.makedirs(model_output_path, exist_ok=True)
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    print("Running training on device {}".format(device))
+    training_dataloader = training_dataset_hub.pytorch(
+        batch_size=BATCH_SIZE, transform=transform_hub_item, collate_fn=collate_fn
+    )
+    classes = training_dataset_hub.labels.info["class_names"]
+    validation_dataloader = validation_dataset_hub.pytorch(
+        batch_size=BATCH_SIZE, transform=transform_hub_item, collate_fn=collate_fn
+    )
+    model = get_fasterrcnn_resnet50_fpn(
+        trainable_backbone_layers=TRAINABLE_BACKBONE_LAYERS,
+        number_classes=len(classes),
+    )
+    model.to(device)
+
+    # construct an optimizer
+    params = [p for p in model.parameters() if p.requires_grad]
+    optimizer = SGD(
+        params, lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY
+    )
+    # and a learning rate scheduler
+    lr_scheduler_training = lr_scheduler.StepLR(
+        optimizer, step_size=STEP_SIZE, gamma=GAMMA
+    )
+
+    for epoch in range(NUM_EPOCHS):
+        train_one_epoch(
+            model,
+            optimizer,
+            training_dataloader,
+            device,
+            epoch,
+            print_freq=PRINT_FREQUENCY,
+        )
+        torch.save(
+            {
+                "state_dict": model.state_dict(),
+                "categories": classes,
+            },
+            os.path.join(model_output_path, f"epoch_{epoch}.pth"),
+        )
+        lr_scheduler_training.step()
+        # evaluate on the validation dataset
+        evaluate(model, validation_dataloader, device=device)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Train a model on a hub dataset.")
+    parser.add_argument(
+        "--train-path",
+        dest="train_path",
+        help="path to your hub training dataset e.g. s3://my-bucket/my-training-dataset",
+        required=True,
+    )
+    parser.add_argument(
+        "--val-path",
+        dest="val_path",
+        help="path to your hub validation dataset e.g. s3://my-bucket/my-validation-dataset",
+        required=True,
+    )
+    args = parser.parse_args()
+    train(args.train_path, args.val_path)
+
+
+if __name__ == "__main__":
+    main()
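
Usage sketch for the two new scripts, reusing the placeholder dataset locations from their own --help strings (substitute real paths; the local path must point to a COCO-style directory understood by dataset.get_coco_dataset, and /path/to/coco is purely illustrative):

    pip install -r requirements.txt
    python speed_test_hub.py --local-path /path/to/coco --hub-path s3://my-bucket/my-dataset
    python train_hub.py --train-path s3://my-bucket/my-training-dataset --val-path s3://my-bucket/my-validation-dataset

train_hub.py writes one snapshot per epoch to outputs/models/<basename of the training dataset path>/epoch_<n>.pth.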
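
For reference, the linear scaling rule applied to LEARNING_RATE in train_hub.py starts from a base learning rate of 0.005 at a reference batch size of 4 and scales it linearly with BATCH_SIZE, so the default BATCH_SIZE = 2 yields 0.005 * 2 / 4 = 0.0025.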