diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..72b4e2e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,60 @@
+# output dir
+# *.png
+# *.json
+# *.jpg
+# compilation and distribution
+# pytorch/python/numpy formats
+# ipython/jupyter notebooks
+# Editor temporaries
+# editor settings
+# project dirs
+# /datasets/*
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ddb76ea
--- /dev/null
+++ b/README.md
@@ -0,0 +1,109 @@
+## OpenDet
+> **Expanding Low-Density Latent Regions for Open-Set Object Detection (CVPR2022)**
+> [Jiaming Han](https://csuhan.com), [Yuqiang Ren](https://github.com/Anymake), [Jian Ding](https://dingjiansw101.github.io), [Xingjia Pan](https://scholar.google.com.hk/citations?user=NaSU3eIAAAAJ&hl=zh-CN), Ke Yan, [Gui-Song Xia](http://www.captain-whu.com/xia_En.html).
+> [arXiv preprint](https://csuhan.com/attaches/cvpr_3605_final.pdf).
+OpenDet2: OpenDet is implemented based on [detectron2](https://github.com/facebookresearch/detectron2).
+### Setup
+The code is based on [detectron2 v0.5](https://github.com/facebookresearch/detectron2/tree/v0.5).
+* **Installation**
+Here is a from-scratch setup script.
+conda create -n opendet2 python=3.8 -y
+conda activate opendet2
+conda install pytorch=1.8.1 torchvision cudatoolkit=10.1 -c pytorch -y
+pip install detectron2==0.5 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
+git clone https://github.com/csuhan/opendet2.git
+cd opendet2
+pip install -v -e .
+* **Prepare datasets**
+Please follow [datasets/README.md](datasets/README.md) for dataset preparation. Then we generate VOC-COCO datasets.
+bash datasets/opendet2_utils/prepare_openset_voc_coco.sh
+# using data splits provided by us.
+cp datasets/voc_coco_ann datasets/voc_coco -rf
+### Model Zoo
+We report the results on VOC and VOC-COCO-20, and provide pretrained models. Please refer to the corresponding log file for full results.
+* **Faster R-CNN**
+| Method | backbone | mAPK↑(VOC) | WI↓ | AOSE↓ | mAPK↑ | APU↑ | Download |
+| FR-CNN | R-50 | 80.06 | 19.50 | 16518 | 58.36 | 0 | [config](configs/faster_rcnn_R_50_FPN_3x_baseline.yaml) [model](https://drive.google.com/drive/folders/10uFOLLCK4N8te08-C-olRyDV-cJ-L6lU?usp=sharing) |
+| PROSER | R-50 | 79.42 | 20.44 | 14266 | 56.72 | 16.99 | [config](configs/faster_rcnn_R_50_FPN_3x_proser.yaml) [model](https://drive.google.com/drive/folders/1_L85gisyvDtBXPe2UbI49vrd5FoBIOI_?usp=sharing) |
+| ORE | R-50 | 79.80 | 18.18 | 12811 | 58.25 | 2.60 | [config]() [model]() |
+| DS | R-50 | 79.70 | 16.76 | 13062 | 58.46 | 8.75 | [config](configs/faster_rcnn_R_50_FPN_3x_ds.yaml) [model](https://drive.google.com/drive/folders/1OWDjL29E2H-_lSApXqM2r8PS7ZvUNtiv?usp=sharing) |
+| OpenDet | R-50 | 80.02 | 12.50 | 10758 | 58.64 | 14.38 | [config](configs/faster_rcnn_R_50_FPN_3x_opendet.yaml) [model](https://drive.google.com/drive/folders/10uFOLLCK4N8te08-C-olRyDV-cJ-L6lU?usp=sharing) |
+| OpenDet | Swin-T | 83.29 | 10.76 | 9149 | 63.42 | 16.35 | [config](configs/faster_rcnn_Swin_T_FPN_3x_opendet.yaml) [model](https://drive.google.com/drive/folders/1j5SkEzeqr0ZnGVVZ4mzXSOvookHfvVvm?usp=sharing) |
+* **RetinaNet**
+| Method | mAPK↑(VOC) | WI↓ | AOSE↓ | mAPK↑ | APU↑ | Download |
+| RetinaNet | 79.63 | 14.16 | 36531 | 57.32 | 0 | [config](configs/retinanet_R_50_FPN_3x_baseline.yaml) [model](https://drive.google.com/drive/folders/15fHfyA2HuXp6LfdTMBuHG6ZwtLcgvD-p?usp=sharing) |
+| Open-RetinaNet | 79.64 | 10.74 | 17208 | 57.32 | 10.55 | [config](configs/retinanet_R_50_FPN_3x_opendet.yaml) [model](https://drive.google.com/drive/folders/1uLRZ5bdGaoORWaP2huiyL_WyLicmWT4G?usp=sharing) |
+* If you cannot access Google Drive, a BaiduYun download link can be found [here](https://pan.baidu.com/s/1I4Pp40pM84aeYTNeGc0kPA) with extraction code ABCD.
+* The above results are reimplemented. Therefore, they are slightly different from our paper.
+* The official code of ORE is at [OWOD](https://github.com/JosephKJ/OWOD). We do not plan to include ORE in our code.
+### Online Demo
+Try our online demo at [huggingface space](https://huggingface.co/spaces/csuhan/opendet2).
+### Train and Test
+* **Testing**
+First, you need to download pretrained weights in the model zoo, e.g., [OpenDet](https://drive.google.com/drive/folders/10uFOLLCK4N8te08-C-olRyDV-cJ-L6lU?usp=sharing).
+Then, run the following command:
+python tools/train_net.py --num-gpus 8 --config-file configs/faster_rcnn_R_50_FPN_3x_opendet.yaml \
+ --eval-only MODEL.WEIGHTS output/faster_rcnn_R_50_FPN_3x_opendet/model_final.pth
+* **Training**
+The training process is the same as detectron2.
+python tools/train_net.py --num-gpus 8 --config-file configs/faster_rcnn_R_50_FPN_3x_opendet.yaml
+To train with the Swin-T backbone, please download [swin_tiny_patch4_window7_224.pth](https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth) and convert it to detectron2's format using [tools/convert_swin_to_d2.py](tools/convert_swin_to_d2.py).
+wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
+python tools/convert_swin_to_d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224_d2.pth
+### Citation
+If you find our work useful for your research, please consider citing:
+ author = {Han, Jiaming and Ren, Yuqiang and Ding, Jian and Pan, Xingjia and Yan, Ke and Xia, Gui-Song},
+ title = {Expanding Low-Density Latent Regions for Open-Set Object Detection},
+ booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2022}
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..0e5bb9c
--- /dev/null
+++ b/app.py
@@ -0,0 +1,56 @@
+Online demo at huggingface.
+The link is: https://huggingface.co/spaces/csuhan/opendet2
+import os
+os.system('pip install torch==1.9 torchvision')
+os.system('pip install detectron2==0.5 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html')
+os.system('pip install timm opencv-python-headless')
+import gradio as gr
+from demo.predictor import VisualizationDemo
+from detectron2.config import get_cfg
+from opendet2 import add_opendet_config
+model_cfgs = {
+ "FR-CNN": ["configs/faster_rcnn_R_50_FPN_3x_baseline.yaml", "frcnn_r50.pth"],
+ "OpenDet-R50": ["configs/faster_rcnn_R_50_FPN_3x_opendet.yaml", "opendet2_r50.pth"],
+ "OpenDet-SwinT": ["configs/faster_rcnn_Swin_T_FPN_18e_opendet_voc.yaml", "opendet2_swint.pth"],
+def setup_cfg(model):
+    """Return a frozen config for the demo model named ``model``.
+
+    ``model`` must be a key of ``model_cfgs``; the matching entry provides the
+    config file to merge and the weights path, in that order. The device is
+    always set to ``"cpu"``.
+    """
+    cfg = get_cfg()
+    add_opendet_config(cfg)
+    config_file, weights_path = model_cfgs[model]
+    cfg.merge_from_file(config_file)
+    cfg.MODEL.WEIGHTS = weights_path
+    cfg.MODEL.DEVICE = "cpu"
+    cfg.freeze()
+    return cfg
+def inference(input, model):
+    """Run the detector selected by ``model`` on ``input`` and return the drawing.
+
+    A fresh ``VisualizationDemo`` is built from ``setup_cfg(model)`` on every
+    call; the raw predictions are discarded and only the rendered image is
+    returned.
+    """
+    demo = VisualizationDemo(setup_cfg(model))
+    _, visualized_output = demo.run_on_image(input)
+    # Reverse the third axis of the rendered array (presumably the colour
+    # channel order expected by the caller; confirm against VisualizationDemo).
+    return visualized_output.get_image()[:, :, ::-1]
+# Gradio front end: `inference` receives an image plus the model name picked
+# from the radio group (default 'OpenDet-R50') and returns an image.
+iface = gr.Interface(
+ inference,
+ [
+ "image",
+ gr.inputs.Radio(
+ ["FR-CNN", "OpenDet-R50", "OpenDet-SwinT"], default='OpenDet-R50'),
+ ],
+ "image")
diff --git a/configs/Base-RCNN-FPN-OPENDET.yaml b/configs/Base-RCNN-FPN-OPENDET.yaml
new file mode 100644
index 0000000..89cc4a4
--- /dev/null
+++ b/configs/Base-RCNN-FPN-OPENDET.yaml
@@ -0,0 +1,25 @@
+_BASE_: "./Base-RCNN-FPN.yaml"
+ MASK_ON: False
+ NAME: "OpenSetStandardROIHeads"
+ NAME: "FastRCNNSeparateConvFCHead"
+ OUTPUT_LAYERS: "OpenDetFastRCNNOutputLayers"
+ SAMPLING_METRIC: "min_score"
+ TOPK: 3
+ ALPHA: 1.0
+ WEIGHT: 1.0
+ OUT_DIM: 128
+ WEIGHT: 0.1
\ No newline at end of file
diff --git a/configs/Base-RCNN-FPN.yaml b/configs/Base-RCNN-FPN.yaml
new file mode 100644
index 0000000..de970a2
--- /dev/null
+++ b/configs/Base-RCNN-FPN.yaml
@@ -0,0 +1,44 @@
+# The same as detectron2/configs/Base-RCNN-FPN.yaml
+ NAME: "build_resnet_fpn_backbone"
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+ FPN:
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
+ SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
+ ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
+ RPN:
+ IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+ PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
+ PRE_NMS_TOPK_TEST: 1000 # Per FPN level
+ # Detectron1 uses 2000 proposals per-batch,
+ # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+ # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+ NAME: "StandardROIHeads"
+ IN_FEATURES: ["p2", "p3", "p4", "p5"]
+ NAME: "FastRCNNConvFCHead"
+ NUM_FC: 2
+ NAME: "MaskRCNNConvUpsampleHead"
+ TRAIN: ("coco_2017_train",)
+ TEST: ("coco_2017_val",)
+ BASE_LR: 0.02
+ STEPS: (60000, 80000)
+ MAX_ITER: 90000
+ MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
diff --git a/configs/Base-RetinaNet.yaml b/configs/Base-RetinaNet.yaml
new file mode 100644
index 0000000..1cefa19
--- /dev/null
+++ b/configs/Base-RetinaNet.yaml
@@ -0,0 +1,26 @@
+# The same as detectron2/configs/Base-RetinaNet.yaml
+ NAME: "build_retinanet_resnet_fpn_backbone"
+ OUT_FEATURES: ["res3", "res4", "res5"]
+ SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"]
+ FPN:
+ IN_FEATURES: ["res3", "res4", "res5"]
+ IOU_THRESHOLDS: [0.4, 0.5]
+ IOU_LABELS: [0, -1, 1]
+ TRAIN: ("coco_2017_train",)
+ TEST: ("coco_2017_val",)
+ BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate
+ STEPS: (60000, 80000)
+ MAX_ITER: 90000
+ MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/configs/faster_rcnn_R_50_FPN_3x_baseline.yaml b/configs/faster_rcnn_R_50_FPN_3x_baseline.yaml
new file mode 100644
index 0000000..40ed937
--- /dev/null
+++ b/configs/faster_rcnn_R_50_FPN_3x_baseline.yaml
@@ -0,0 +1,16 @@
+_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
+ WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+ DEPTH: 50
+ OUTPUT_LAYERS: "CosineFastRCNNOutputLayers" # baseline use a simple cosine FRCNN
+ TRAIN: ('voc_2007_train', 'voc_2012_trainval')
+ TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
+ STEPS: (21000, 29000)
+ MAX_ITER: 32000
+ AMP:
\ No newline at end of file
diff --git a/configs/faster_rcnn_R_50_FPN_3x_ds.yaml b/configs/faster_rcnn_R_50_FPN_3x_ds.yaml
new file mode 100644
index 0000000..b3d6421
--- /dev/null
+++ b/configs/faster_rcnn_R_50_FPN_3x_ds.yaml
@@ -0,0 +1,18 @@
+_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
+ WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+ DEPTH: 50
+ NAME: "DropoutStandardROIHeads"
+ OUTPUT_LAYERS: "DropoutFastRCNNOutputLayers"
+ TRAIN: ('voc_2007_train', 'voc_2012_trainval')
+ TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
+ STEPS: (21000, 29000)
+ MAX_ITER: 32000
+ AMP:
\ No newline at end of file
diff --git a/configs/faster_rcnn_R_50_FPN_3x_opendet.yaml b/configs/faster_rcnn_R_50_FPN_3x_opendet.yaml
new file mode 100644
index 0000000..006bedb
--- /dev/null
+++ b/configs/faster_rcnn_R_50_FPN_3x_opendet.yaml
@@ -0,0 +1,16 @@
+_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
+ WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+ DEPTH: 50
+ TRAIN: ('voc_2007_train', 'voc_2012_trainval')
+ TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
+ STEPS: (21000, 29000)
+ MAX_ITER: 32000
+ AMP:
+# UPLOSS.WEIGHT: former two are 0.5, the last is 1.0
\ No newline at end of file
diff --git a/configs/faster_rcnn_R_50_FPN_3x_proser.yaml b/configs/faster_rcnn_R_50_FPN_3x_proser.yaml
new file mode 100644
index 0000000..62f4006
--- /dev/null
+++ b/configs/faster_rcnn_R_50_FPN_3x_proser.yaml
@@ -0,0 +1,16 @@
+_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
+ WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+ DEPTH: 50
+ TRAIN: ('voc_2007_train', 'voc_2012_trainval')
+ TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
+ STEPS: (21000, 29000)
+ MAX_ITER: 32000
+ AMP:
\ No newline at end of file
diff --git a/configs/faster_rcnn_Swin_T_FPN_3x_opendet.yaml b/configs/faster_rcnn_Swin_T_FPN_3x_opendet.yaml
new file mode 100644
index 0000000..e3c7607
--- /dev/null
+++ b/configs/faster_rcnn_Swin_T_FPN_3x_opendet.yaml
@@ -0,0 +1,25 @@
+_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
+ WEIGHTS: "checkpoints/swin_tiny_patch4_window7_224_d2.pth"
+ PIXEL_MEAN: [123.675, 116.28, 103.53]
+ PIXEL_STD: [58.395, 57.12, 57.375]
+ DEPTH: 50
+ NAME: "build_swint_fpn_backbone"
+ OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+ FPN:
+ IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+ TRAIN: ('voc_2007_train', 'voc_2012_trainval')
+ TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
+ STEPS: (21000, 29000)
+ MAX_ITER: 32000
+ BASE_LR: 0.0001
+ AMP:
\ No newline at end of file
diff --git a/configs/retinanet_R_50_FPN_3x_baseline.yaml b/configs/retinanet_R_50_FPN_3x_baseline.yaml
new file mode 100644
index 0000000..923de04
--- /dev/null
+++ b/configs/retinanet_R_50_FPN_3x_baseline.yaml
@@ -0,0 +1,17 @@
+_BASE_: "./Base-RetinaNet.yaml"
+ WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+ DEPTH: 50
+ TRAIN: ('voc_2007_train', 'voc_2012_trainval')
+ TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
+ STEPS: (21000, 29000)
+ MAX_ITER: 32000
+ AMP:
\ No newline at end of file
diff --git a/configs/retinanet_R_50_FPN_3x_opendet.yaml b/configs/retinanet_R_50_FPN_3x_opendet.yaml
new file mode 100644
index 0000000..af4cc21
--- /dev/null
+++ b/configs/retinanet_R_50_FPN_3x_opendet.yaml
@@ -0,0 +1,25 @@
+_BASE_: "./Base-RetinaNet.yaml"
+ WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+ DEPTH: 50
+ TRAIN: ('voc_2007_train', 'voc_2012_trainval')
+ TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
+ STEPS: (21000, 29000)
+ MAX_ITER: 32000
+ AMP:
+ TOPK: 10
+ WEIGHT: 0.2
+ QUEUE_SIZE: 1024
+ WEIGHT: 0.2
\ No newline at end of file
diff --git a/datasets/README.md b/datasets/README.md
new file mode 100644
index 0000000..2cbf4f0
--- /dev/null
+++ b/datasets/README.md
@@ -0,0 +1,51 @@
+# Use Builtin Datasets
+A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog)
+for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc).
+This document explains how to set up the builtin datasets so they can be used by the above APIs.
+[Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
+and how to add new datasets to them.
+Detectron2 has builtin support for a few datasets.
+The datasets are assumed to exist in a directory specified by the environment variable
+Under this directory, detectron2 will look for datasets in the structure described below, if needed.
+ coco/
+ VOC20{07,12}/
+You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
+If left unset, the default is `./datasets` relative to your current working directory.
+The [model zoo](https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md)
+contains configs and models that use these builtin datasets.
+## Expected dataset structure for [COCO instance/keypoint detection](https://cocodataset.org/#download):
+ annotations/
+ instances_{train,val}2017.json
+ person_keypoints_{train,val}2017.json
+ {train,val}2017/
+ # image files that are mentioned in the corresponding json
+You can use the 2014 version of the dataset as well.
+Some of the builtin tests (`dev/run_*_tests.sh`) use a tiny version of the COCO dataset,
+which you can download with `./datasets/prepare_for_tests.sh`.
+## Expected dataset structure for [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/index.html):
+ Annotations/
+ ImageSets/
+ Main/
+ trainval.txt
+ test.txt
+ # train.txt or val.txt, if you use these splits
+ JPEGImages/
diff --git a/datasets/opendet2_utils/convert_coco_to_voc.py b/datasets/opendet2_utils/convert_coco_to_voc.py
new file mode 100644
index 0000000..9a0ed9d
--- /dev/null
+++ b/datasets/opendet2_utils/convert_coco_to_voc.py
@@ -0,0 +1,63 @@
+import xml.etree.cElementTree as ET
+import os
+import argparse
+from tqdm import tqdm
+from pycocotools.coco import COCO
+ "airplane": "aeroplane",
+ "dining table": "diningtable",
+ "motorcycle": "motorbike",
+ "potted plant": "pottedplant",
+ "couch": "sofa",
+ "tv": "tvmonitor",
+def parse_args():
+    """Parse the command-line options of the COCO-to-VOC converter."""
+    options = (
+        ("--dir", "datasets/voc_coco", "dataset dir"),
+        ("--ann_path", "datasets/coco/annotations/instances_train2017.json", "annotation path"),
+    )
+    cli = argparse.ArgumentParser(description='Convert COCO to VOC style')
+    for flag, default, help_text in options:
+        cli.add_argument(flag, default=default, type=str, help=help_text)
+    return cli.parse_args()
+def convert_coco_to_voc(coco_annotation_file, target_folder):
+    """Convert a COCO instance-annotation file into VOC-style XML annotations.
+
+    For every image id present in ``imgToAnns`` (images that have annotations),
+    one ``<target_folder>/Annotations/<stem>.xml`` file is written containing the
+    file name, the image size and one ``object`` element per annotation. The
+    stems of all converted images are then written, one per line, to
+    ``<target_folder>/ImageSets/Main/<annotation file name without .json>.txt``.
+
+    Args:
+        coco_annotation_file: path of the COCO ``instances_*.json`` file.
+        target_folder: root of the VOC-style output tree (created if missing).
+    """
+    annotation_dir = os.path.join(target_folder, 'Annotations')
+    os.makedirs(annotation_dir, exist_ok=True)
+    coco_instance = COCO(coco_annotation_file)
+    image_ids = []
+    for image_id in tqdm(coco_instance.imgToAnns):
+        image_details = coco_instance.imgs[image_id]
+        annotation_el = ET.Element('annotation')
+        ET.SubElement(annotation_el, 'filename').text = image_details['file_name']
+
+        size_el = ET.SubElement(annotation_el, 'size')
+        ET.SubElement(size_el, 'width').text = str(image_details['width'])
+        ET.SubElement(size_el, 'height').text = str(image_details['height'])
+        ET.SubElement(size_el, 'depth').text = str(3)
+
+        for annotation in coco_instance.imgToAnns[image_id]:
+            object_el = ET.SubElement(annotation_el, 'object')
+            cls_name = coco_instance.cats[annotation['category_id']]['name']
+            # Use the VOC spelling when one exists, otherwise keep the COCO name.
+            cls_name = COCO2VOC_CLASS_NAMES.get(cls_name, cls_name)
+            ET.SubElement(object_el, 'name').text = cls_name
+            ET.SubElement(object_el, 'difficult').text = '0'
+
+            # COCO boxes are [x, y, width, height]; emit corner coordinates
+            # shifted by one (presumably VOC's 1-based pixel convention).
+            x, y, width, height = annotation['bbox'][:4]
+            bb_el = ET.SubElement(object_el, 'bndbox')
+            ET.SubElement(bb_el, 'xmin').text = str(int(x + 1.0))
+            ET.SubElement(bb_el, 'ymin').text = str(int(y + 1.0))
+            ET.SubElement(bb_el, 'xmax').text = str(int(x + width + 1.0))
+            ET.SubElement(bb_el, 'ymax').text = str(int(y + height + 1.0))
+
+        # splitext drops only the extension, so names with extra dots keep
+        # their full stem (split('.')[0] truncated them at the first dot).
+        file_name = os.path.splitext(image_details['file_name'])[0]
+        image_ids.append(file_name)
+        ET.ElementTree(annotation_el).write(os.path.join(annotation_dir, file_name + '.xml'))
+
+    imageset_dir = os.path.join(target_folder, 'ImageSets/Main')
+    os.makedirs(imageset_dir, exist_ok=True)
+    imageset_name = os.path.basename(coco_annotation_file).split(".json")[0] + ".txt"
+    with open(os.path.join(imageset_dir, imageset_name), 'w') as f:
+        # A single string is written with write(); writelines() would iterate
+        # it character by character.
+        f.write("\n".join(image_ids) + '\n')
+if __name__ == '__main__':
+ args = parse_args()
+ convert_coco_to_voc(args.ann_path, args.dir)
diff --git a/datasets/opendet2_utils/prepare_openset_voc_coco.sh b/datasets/opendet2_utils/prepare_openset_voc_coco.sh
new file mode 100644
index 0000000..829e983
--- /dev/null
+++ b/datasets/opendet2_utils/prepare_openset_voc_coco.sh
@@ -0,0 +1,92 @@
+# Build the VOC-COCO open-set benchmark under $DATA_DIR.
+# DATA_DIR, COCO_DIR, VOC07_DIR and VOC12_DIR must be set before this point;
+# abort instead of letting `rm -rf` / `cp` run with an empty path.
+: "${DATA_DIR:?DATA_DIR must be set}"
+: "${COCO_DIR:?COCO_DIR must be set}"
+: "${VOC07_DIR:?VOC07_DIR must be set}"
+: "${VOC12_DIR:?VOC12_DIR must be set}"
+MAIN_DIR="$DATA_DIR/ImageSets/Main"
+
+# merge_imagesets OUT IN1 IN2: write IN1 followed by IN2 (image-set names
+# without ".txt", all inside $MAIN_DIR) to OUT.
+merge_imagesets() {
+    echo "generate $1 imagesets"
+    cat "$MAIN_DIR/$2.txt" "$MAIN_DIR/$3.txt" > "$MAIN_DIR/$1.txt"
+}
+
+# make necessary dirs (always start from a clean $DATA_DIR)
+rm -rf -- "$DATA_DIR"
+echo "make dirs"
+mkdir -p "$DATA_DIR"
+mkdir -p "$DATA_DIR/Annotations"
+# $DATA_DIR/JPEGImages is not created here: if it already existed, the
+# `cp -r` below would nest train2017 inside it instead of copying it as JPEGImages.
+mkdir -p "$DATA_DIR/ImageSets"
+mkdir -p "$MAIN_DIR"
+
+# cp data
+# make sure $COCO_DIR, $VOC07_DIR and $VOC12_DIR exist
+echo "copy coco images"
+cp -r "$COCO_DIR/train2017" "$DATA_DIR/JPEGImages"
+cp "$COCO_DIR"/val2017/* "$DATA_DIR/JPEGImages/"
+
+echo "convert coco annotation to voc"
+python datasets/opendet2_utils/convert_coco_to_voc.py --dir "$DATA_DIR" --ann_path "$COCO_DIR/annotations/instances_train2017.json"
+python datasets/opendet2_utils/convert_coco_to_voc.py --dir "$DATA_DIR" --ann_path "$COCO_DIR/annotations/instances_val2017.json"
+
+# generate imageset
+echo "generate coco sub imagesets"
+# class incremental settings
+# classes 20-40
+python datasets/opendet2_utils/prepare_openset_voc_coco_cls_specific.py --dir "$DATA_DIR" --in_split instances_train2017 --out_split instances_train2017_cls_spe_20_40 --start_class 20 --end_class 40 --pre_num_sample 8000 --post_num_sample 5000
+# classes 20-60
+python datasets/opendet2_utils/prepare_openset_voc_coco_cls_specific.py --dir "$DATA_DIR" --in_split instances_train2017 --out_split instances_train2017_cls_spe_20_60 --start_class 20 --end_class 60 --pre_num_sample 16000 --post_num_sample 10000
+# classes 20-80
+python datasets/opendet2_utils/prepare_openset_voc_coco_cls_specific.py --dir "$DATA_DIR" --in_split instances_train2017 --out_split instances_train2017_cls_spe_20_80 --start_class 20 --end_class 80 --pre_num_sample 24000 --post_num_sample 15000
+
+# image incremental settings
+# 2500
+python datasets/opendet2_utils/prepare_openset_voc_coco_cls_agnostic.py --dir "$DATA_DIR" --in_split instances_train2017 --out_split instances_train2017_cls_agn_2500 --start_class 20 --end_class 80 --post_num_sample 2500
+# 5000
+python datasets/opendet2_utils/prepare_openset_voc_coco_cls_agnostic.py --dir "$DATA_DIR" --in_split instances_train2017 --out_split instances_train2017_cls_agn_5000 --start_class 20 --end_class 80 --post_num_sample 5000
+# 10000
+python datasets/opendet2_utils/prepare_openset_voc_coco_cls_agnostic.py --dir "$DATA_DIR" --in_split instances_train2017 --out_split instances_train2017_cls_agn_10000 --start_class 20 --end_class 80 --post_num_sample 10000
+# 20000
+python datasets/opendet2_utils/prepare_openset_voc_coco_cls_agnostic.py --dir "$DATA_DIR" --in_split instances_train2017 --out_split instances_train2017_cls_agn_20000 --start_class 20 --end_class 80 --post_num_sample 20000
+
+echo "copy voc images"
+cp "$VOC07_DIR"/JPEGImages/* "$DATA_DIR/JPEGImages/"
+cp "$VOC12_DIR"/JPEGImages/* "$DATA_DIR/JPEGImages/"
+
+echo "copy voc annotation"
+cp "$VOC07_DIR"/Annotations/* "$DATA_DIR/Annotations/"
+cp "$VOC12_DIR"/Annotations/* "$DATA_DIR/Annotations/"
+
+echo "copy voc imagesets"
+cp "$VOC07_DIR/ImageSets/Main/train.txt" "$MAIN_DIR/voc07train.txt"
+cp "$VOC07_DIR/ImageSets/Main/val.txt" "$MAIN_DIR/voc07val.txt"
+cp "$VOC07_DIR/ImageSets/Main/test.txt" "$MAIN_DIR/voc07test.txt"
+cp "$VOC12_DIR/ImageSets/Main/trainval.txt" "$MAIN_DIR/voc12trainval.txt"
+
+# Every evaluation split is the VOC07 split followed by a COCO sub-imageset.
+merge_imagesets voc_coco_val voc07val instances_val2017
+merge_imagesets voc_coco_20_40_test voc07test instances_train2017_cls_spe_20_40
+merge_imagesets voc_coco_20_60_test voc07test instances_train2017_cls_spe_20_60
+merge_imagesets voc_coco_20_80_test voc07test instances_train2017_cls_spe_20_80
+merge_imagesets voc_coco_2500_test voc07test instances_train2017_cls_agn_2500
+merge_imagesets voc_coco_5000_test voc07test instances_train2017_cls_agn_5000
+merge_imagesets voc_coco_10000_test voc07test instances_train2017_cls_agn_10000
+merge_imagesets voc_coco_20000_test voc07test instances_train2017_cls_agn_20000
diff --git a/datasets/opendet2_utils/prepare_openset_voc_coco_cls_agnostic.py b/datasets/opendet2_utils/prepare_openset_voc_coco_cls_agnostic.py
new file mode 100644
index 0000000..b1feef8
--- /dev/null
+++ b/datasets/opendet2_utils/prepare_openset_voc_coco_cls_agnostic.py
@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+import os
+import itertools
+import xml.etree.ElementTree as ET
+from typing import List, Tuple, Union
+import argparse
+from collections import defaultdict
+import random
+import operator
+from functools import reduce
+from detectron2.utils.file_io import PathManager
+ "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
+ "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
+ "pottedplant", "sheep", "sofa", "train", "tvmonitor"
+ "truck", "traffic light", "fire hydrant", "stop sign", "parking meter",
+ "bench", "elephant", "bear", "zebra", "giraffe",
+ "backpack", "umbrella", "handbag", "tie", "suitcase",
+ "microwave", "oven", "toaster", "sink", "refrigerator"
+ "frisbee", "skis", "snowboard", "sports ball", "kite",
+ "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
+ "banana", "apple", "sandwich", "orange", "broccoli",
+ "carrot", "hot dog", "pizza", "donut", "cake"
+ "bed", "toilet", "laptop", "mouse",
+ "remote", "keyboard", "cell phone", "book", "clock",
+ "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
+ "wine glass", "cup", "fork", "knife", "spoon", "bowl"
+def parse_args():
+    """Parse the command-line options of the class-agnostic open-set split generator."""
+    text_options = (
+        ("--dir", "datasets/voc_coco", "dataset dir"),
+        ("--in_split", "instances_train2017", "in split name"),
+        ("--out_split", "instances_train2017_openset_cls_agn_5000", "out split name"),
+    )
+    int_options = (
+        ("--start_class", "20"),
+        ("--end_class", "40"),
+        ("--post_num_sample", "5000"),
+    )
+    cli = argparse.ArgumentParser(description='openset voc generator')
+    for flag, default, help_text in text_options:
+        cli.add_argument(flag, default=default, type=str, help=help_text)
+    # The defaults stay strings; argparse converts them through type=int.
+    for flag, default in int_options:
+        cli.add_argument(flag, default=default, type=int)
+    return cli.parse_args()
+def prepare_openset(dirname: str, in_split: str, out_split: str, start_class: int, end_class: int, post_num_sample_img: int):
+    """Write a random image set restricted to a range of class names.
+
+    Reads the image ids listed in ``<dirname>/ImageSets/Main/<in_split>.txt``
+    and keeps every image whose annotated object names are all outside
+    ``VOC_COCO_CLASS_NAMES[:start_class]`` and ``VOC_COCO_CLASS_NAMES[end_class:]``
+    (an image without any object passes this filter as well). At most
+    ``post_num_sample_img`` of the kept ids are sampled at random and written,
+    one per line, to ``<dirname>/ImageSets/Main/<out_split>.txt``.
+    """
+    main_dir = os.path.join(dirname, "ImageSets", "Main")
+    with PathManager.open(os.path.join(main_dir, in_split + ".txt")) as f:
+        # loadtxt returns a 0-d array for a one-line file, which cannot be
+        # iterated; atleast_1d keeps the loop below valid in that case.
+        fileids = np.atleast_1d(np.loadtxt(f, dtype=str))
+
+    annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/"))
+    # Loop-invariant: names whose presence disqualifies an image.
+    excluded = set(VOC_COCO_CLASS_NAMES[:start_class] + VOC_COCO_CLASS_NAMES[end_class:])
+    image_ids = set()
+    for fileid in fileids:
+        anno_file = os.path.join(annotation_dirname, fileid + ".xml")
+        with PathManager.open(anno_file) as f:
+            tree = ET.parse(f)
+        classes = {obj.find("name").text for obj in tree.findall("object")}
+        if excluded.isdisjoint(classes):
+            image_ids.add(fileid)
+
+    post_num_sample_img = min(len(image_ids), post_num_sample_img)
+    # random.sample() no longer accepts a set (TypeError on Python >= 3.11);
+    # sorting first also makes the draw reproducible under a fixed seed.
+    sampled = random.sample(sorted(image_ids), post_num_sample_img)
+    with open(os.path.join(main_dir, out_split + ".txt"), "w") as f:
+        f.write("\n".join(sampled) + "\n")
+if __name__ == "__main__":
+ args = parse_args()
+ prepare_openset(args.dir, args.in_split, args.out_split, args.start_class, args.end_class, args.post_num_sample)
\ No newline at end of file
diff --git a/datasets/opendet2_utils/prepare_openset_voc_coco_cls_specific.py b/datasets/opendet2_utils/prepare_openset_voc_coco_cls_specific.py
new file mode 100644
index 0000000..74e45bb
--- /dev/null
+++ b/datasets/opendet2_utils/prepare_openset_voc_coco_cls_specific.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+import numpy as np
+import os
+import itertools
+import xml.etree.ElementTree as ET
+from typing import List, Tuple, Union
+import argparse
+from collections import defaultdict
+import random
+import operator
+from functools import reduce
+from detectron2.utils.file_io import PathManager
+ "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
+ "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
+ "pottedplant", "sheep", "sofa", "train", "tvmonitor"
+ "truck", "traffic light", "fire hydrant", "stop sign", "parking meter",
+ "bench", "elephant", "bear", "zebra", "giraffe",
+ "backpack", "umbrella", "handbag", "tie", "suitcase",
+ "microwave", "oven", "toaster", "sink", "refrigerator"
+ "frisbee", "skis", "snowboard", "sports ball", "kite",
+ "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
+ "banana", "apple", "sandwich", "orange", "broccoli",
+ "carrot", "hot dog", "pizza", "donut", "cake"
+ "bed", "toilet", "laptop", "mouse",
+ "remote", "keyboard", "cell phone", "book", "clock",
+ "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
+ "wine glass", "cup", "fork", "knife", "spoon", "bowl"
+def parse_args():
+    """Parse the command-line options of the class-specific open-set split generator."""
+    text_options = (
+        ("--dir", "datasets/voc_coco", "dataset dir"),
+        ("--in_split", "instances_train2017", "in split name"),
+        ("--out_split", "instances_train2017_openset_cls_spe_0_20", "out split name"),
+    )
+    int_options = (
+        ("--start_class", "20"),
+        ("--end_class", "40"),
+        ("--pre_num_sample", "8000"),
+        ("--post_num_sample", "5000"),
+    )
+    cli = argparse.ArgumentParser(description='openset voc generator')
+    for flag, default, help_text in text_options:
+        cli.add_argument(flag, default=default, type=str, help=help_text)
+    # The defaults stay strings; argparse converts them through type=int.
+    for flag, default in int_options:
+        cli.add_argument(flag, default=default, type=int)
+    return cli.parse_args()
+def prepare_openset(dirname: str, in_split: str, out_split: str, start_class: int, end_class: int, pre_num_sample_img: int, post_num_sample_img: int):
+    """Write a class-balanced random image set for a range of class names.
+
+    An image listed in ``<dirname>/ImageSets/Main/<in_split>.txt`` is a
+    candidate when it contains at least one object from
+    ``VOC_COCO_CLASS_NAMES[start_class:end_class]``, no ``"person"`` object and
+    no object from ``VOC_COCO_CLASS_NAMES[end_class:]``. Candidates are first
+    sampled per class, proportionally to each class's object count, so that
+    roughly ``pre_num_sample_img`` entries are drawn in total; the union of
+    those draws is then sampled down to at most ``post_num_sample_img`` ids,
+    written one per line to ``<dirname>/ImageSets/Main/<out_split>.txt``.
+    """
+    main_dir = os.path.join(dirname, "ImageSets", "Main")
+    with PathManager.open(os.path.join(main_dir, in_split + ".txt")) as f:
+        # loadtxt returns a 0-d array for a one-line file, which cannot be
+        # iterated; atleast_1d keeps the loop below valid in that case.
+        fileids = np.atleast_1d(np.loadtxt(f, dtype=str))
+
+    annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/"))
+    # Loop-invariant name sets.
+    wanted = set(VOC_COCO_CLASS_NAMES[start_class:end_class])
+    later = set(VOC_COCO_CLASS_NAMES[end_class:])
+    image_ids = defaultdict(list)
+    for fileid in fileids:
+        anno_file = os.path.join(annotation_dirname, fileid + ".xml")
+        with PathManager.open(anno_file) as f:
+            tree = ET.parse(f)
+        classes = [obj.find("name").text for obj in tree.findall("object")]
+        present = set(classes)
+        if "person" in present or present.isdisjoint(wanted) or not present.isdisjoint(later):
+            continue
+        # One entry per annotated object (not per image), so each list's
+        # length is that class's object count among the candidates.
+        for cls in classes:
+            image_ids[cls].append(fileid)
+
+    # count class statistics
+    total_objects = sum(len(ids) for ids in image_ids.values())
+    # With no candidate at all, skip the division and produce an empty split
+    # instead of raising ZeroDivisionError.
+    ratio = float(pre_num_sample_img) / total_objects if total_objects else 0.0
+
+    sampled = set()
+    for ids in image_ids.values():
+        cls_sample_num = min(int(ratio * len(ids)), len(ids))
+        sampled.update(random.sample(ids, cls_sample_num))
+
+    post_num_sample_img = min(post_num_sample_img, len(sampled))
+    # random.sample() no longer accepts a set (TypeError on Python >= 3.11);
+    # sorting first also makes the draw reproducible under a fixed seed.
+    chosen = random.sample(sorted(sampled), post_num_sample_img)
+    with open(os.path.join(main_dir, out_split + ".txt"), "w") as f:
+        f.write("\n".join(chosen) + "\n")
+if __name__ == "__main__":
+ args = parse_args()
+ prepare_openset(args.dir, args.in_split, args.out_split, args.start_class, args.end_class, args.pre_num_sample, args.post_num_sample)
\ No newline at end of file
diff --git a/datasets/opendet2_utils/split_coco_trainval.py b/datasets/opendet2_utils/split_coco_trainval.py
new file mode 100644
index 0000000..1846e96
--- /dev/null
+++ b/datasets/opendet2_utils/split_coco_trainval.py
@@ -0,0 +1,57 @@
+from pycocotools.coco import COCO
+import numpy as np
+import random
+import operator
+import argparse
+from functools import reduce
+from collections import defaultdict
+import os
def parse_args():
    """Parse the command-line options of the COCO train/val split script.

    Returns:
        argparse.Namespace: with string attributes ``dir`` (output dir) and
        ``ann_path`` (annotation path).
    """
    option_specs = (
        ("--dir", "datasets/voc_coco/ImageSets/Main", "output dir"),
        ("--ann_path", "datasets/coco/annotations/instances_train2017.json", "annotation path"),
    )
    cli = argparse.ArgumentParser(description='openset voc generator')
    for flag, default_value, help_text in option_specs:
        cli.add_argument(flag, default=default_value, type=str, help=help_text)
    return cli.parse_args()
def split_coco_trainval(ann_file, out_dir, min_sample_num=10, max_sample_num=50):
    """Split the images of a COCO annotation file into train and val name lists.

    Images are grouped by the category names annotated in them. For every
    category a number of images proportional to the category size (relative
    to the smallest category, scaled by ``min_sample_num`` and capped at
    ``max_sample_num``) is drawn at random for the val split; the remaining
    images go to the train split. An image drawn for val by any category is
    excluded from train.

    Args:
        ann_file (str): path to a COCO instances annotation json.
        out_dir (str): output location. Either the ``ImageSets/Main``
            directory itself (as the script's ``--dir`` default is) or a
            dataset root, in which case ``ImageSets/Main`` is appended.
        min_sample_num (int): val images drawn for the smallest category.
        max_sample_num (int): upper bound of val images drawn per category.

    Raises:
        ValueError: if the annotation file contains no annotated image.
    """
    coco_instance = COCO(ann_file)

    # Group image names (file name up to the first '.') by category name.
    images_per_class = defaultdict(set)
    for image_id, annotations in coco_instance.imgToAnns.items():
        image_name = coco_instance.imgs[image_id]['file_name'].split('.')[0]
        for annotation in annotations:
            class_name = coco_instance.cats[annotation['category_id']]['name']
            images_per_class[class_name].add(image_name)

    if not images_per_class:
        raise ValueError("No annotated images found in {}".format(ann_file))
    min_num = min(len(names) for names in images_per_class.values())

    train_names, val_names = set(), set()
    for class_images in images_per_class.values():
        # Sort before shuffling so the draw depends only on the random state,
        # not on the (hash-dependent) iteration order of a set of strings.
        names = sorted(class_images)
        num_to_sample = int(len(names) / min_num * min_sample_num)
        num_to_sample = min(num_to_sample, max_sample_num)
        random.shuffle(names)
        val_names.update(names[:num_to_sample])
        train_names.update(names[num_to_sample:])
    # An image held out for validation by any class must not be trained on.
    train_names -= val_names

    # The CLI default for out_dir already points at ".../ImageSets/Main";
    # appending it again unconditionally produced a non-existent doubled path.
    main_subdir = os.path.join("ImageSets", "Main")
    if os.path.normpath(out_dir).endswith(main_subdir):
        target_dir = out_dir
    else:
        target_dir = os.path.join(out_dir, main_subdir)
    os.makedirs(target_dir, exist_ok=True)

    with open(os.path.join(target_dir, "instances_train2017_train.txt"), "w") as f:
        f.write("\n".join(sorted(train_names)))
    with open(os.path.join(target_dir, "instances_train2017_val.txt"), "w") as f:
        f.write("\n".join(sorted(val_names)))
if __name__ == "__main__":
    # Script entry point: split the annotation file given on the command line.
    cli_args = parse_args()
    split_coco_trainval(ann_file=cli_args.ann_path, out_dir=cli_args.dir)
\ No newline at end of file
diff --git a/datasets/voc_coco_ann/ImageSets/Main/voc_coco_10000_test.txt b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_10000_test.txt
new file mode 100644
index 0000000..19827dc
--- /dev/null
+++ b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_10000_test.txt
@@ -0,0 +1,14952 @@
diff --git a/datasets/voc_coco_ann/ImageSets/Main/voc_coco_15000_test.txt b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_15000_test.txt
new file mode 100644
index 0000000..30177c1
--- /dev/null
+++ b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_15000_test.txt
@@ -0,0 +1,4952 @@
diff --git a/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20000_test.txt b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20000_test.txt
new file mode 100644
index 0000000..e2cf3e7
--- /dev/null
+++ b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20000_test.txt
@@ -0,0 +1,24952 @@
diff --git a/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20_40_test.txt b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20_40_test.txt
new file mode 100644
index 0000000..9289f05
--- /dev/null
+++ b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20_40_test.txt
@@ -0,0 +1,9952 @@
diff --git a/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20_60_test.txt b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20_60_test.txt
new file mode 100644
index 0000000..713771c
--- /dev/null
+++ b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20_60_test.txt
@@ -0,0 +1,14284 @@
diff --git a/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20_80_test.txt b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20_80_test.txt
new file mode 100644
index 0000000..6a8d2d2
--- /dev/null
+++ b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_20_80_test.txt
@@ -0,0 +1,19952 @@
diff --git a/datasets/voc_coco_ann/ImageSets/Main/voc_coco_2500_test.txt b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_2500_test.txt
new file mode 100644
index 0000000..85eaaa0
--- /dev/null
+++ b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_2500_test.txt
@@ -0,0 +1,7452 @@
diff --git a/datasets/voc_coco_ann/ImageSets/Main/voc_coco_5000_test.txt b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_5000_test.txt
new file mode 100644
index 0000000..5f0f88a
--- /dev/null
+++ b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_5000_test.txt
@@ -0,0 +1,9952 @@
diff --git a/datasets/voc_coco_ann/ImageSets/Main/voc_coco_val.txt b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_val.txt
new file mode 100644
index 0000000..9d9dd3a
--- /dev/null
+++ b/datasets/voc_coco_ann/ImageSets/Main/voc_coco_val.txt
@@ -0,0 +1,7462 @@
\ No newline at end of file
diff --git a/demo/demo.py b/demo/demo.py
new file mode 100644
index 0000000..d772939
--- /dev/null
+++ b/demo/demo.py
@@ -0,0 +1,194 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import argparse
+import glob
+import multiprocessing as mp
+import numpy as np
+import os
+import tempfile
+import time
+import warnings
+import cv2
+import tqdm
+from detectron2.config import get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.utils.logger import setup_logger
+from predictor import VisualizationDemo
+import sys
+sys.path.insert(-1, "../")
+from opendet2 import add_opendet_config, builtin, OpenDetTrainer
+# constants
+WINDOW_NAME = "COCO detections"
def setup_cfg(args):
    """Build the frozen config used by the demo.

    Starts from ``get_cfg()``, adds the OpenDet keys, merges ``args.config_file``
    and then the ``args.opts`` overrides, and finally sets the test-time score
    threshold of both the RetinaNet and the ROI heads to
    ``args.confidence_threshold``.

    Args:
        args: parsed command-line arguments providing ``config_file``, ``opts``
            and ``confidence_threshold``.

    Returns:
        The frozen config node.
    """
    cfg = get_cfg()
    add_opendet_config(cfg)
    # To use demo for Panoptic-DeepLab, please uncomment the following two lines.
    # from detectron2.projects.panoptic_deeplab import add_panoptic_deeplab_config  # noqa
    # add_panoptic_deeplab_config(cfg)

    # File settings first, command-line 'KEY VALUE' overrides second.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # Same score threshold for both builtin model families.
    threshold = args.confidence_threshold
    for head_cfg in (cfg.MODEL.RETINANET, cfg.MODEL.ROI_HEADS):
        head_cfg.SCORE_THRESH_TEST = threshold

    cfg.freeze()
    return cfg
def get_parser():
    """Create the argument parser of the demo script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--config-file``, ``--webcam``,
        ``--video-input``, ``--input``, ``--output``, ``--confidence-threshold``
        and ``--opts``.
    """
    cli = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
    add = cli.add_argument

    add(
        "--config-file",
        default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
        metavar="FILE",
        help="path to config file",
    )
    add("--webcam", action="store_true", help="Take inputs from webcam.")
    add("--video-input", help="Path to video file.")

    input_help = (
        "A list of space separated input images; "
        "or a single glob pattern such as 'directory/*.jpg'"
    )
    add("--input", nargs="+", help=input_help)

    output_help = (
        "A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window."
    )
    add("--output", help=output_help)

    add(
        "--confidence-threshold",
        type=float,
        default=0.5,
        help="Minimum score for instance predictions to be shown",
    )
    # Everything after --opts is collected verbatim as config overrides.
    add(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=[],
        nargs=argparse.REMAINDER,
    )
    return cli
def test_opencv_video_format(codec, file_ext):
    """Check whether OpenCV produces a file for the given codec/extension pair.

    Writes 30 black 10x10 frames into a file inside a temporary directory.

    Args:
        codec (str): fourcc characters, unpacked into ``cv2.VideoWriter_fourcc``.
        file_ext (str): file extension (including the dot) of the probe file.

    Returns:
        bool: True if the probe file exists after the writer is released.
    """
    with tempfile.TemporaryDirectory(prefix="video_format_test") as tmp_dir:
        filename = os.path.join(tmp_dir, "test_file" + file_ext)
        writer = cv2.VideoWriter(
            filename=filename,
            fourcc=cv2.VideoWriter_fourcc(*codec),
            fps=float(30),
            frameSize=(10, 10),
            isColor=True,
        )
        for _ in range(30):
            writer.write(np.zeros((10, 10, 3), np.uint8))
        writer.release()
        # Must be evaluated before the temporary directory is removed.
        return os.path.isfile(filename)
if __name__ == "__main__":
    # Demo entry point. Exactly one input mode is handled per run, checked in
    # this order: --input (images), --webcam, --video-input.
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemo(cfg)

    if args.input:
        # A single --input value is treated as a glob pattern.
        if len(args.input) == 1:
            args.input = glob.glob(os.path.expanduser(args.input[0]))
            assert args.input, "The input path(s) was not found"
        for path in tqdm.tqdm(args.input, disable=not args.output):
            # use PIL, to be consistent with evaluation
            img = read_image(path, format="BGR")
            start_time = time.time()
            predictions, visualized_output = demo.run_on_image(img)
            logger.info(
                "{}: {} in {:.2f}s".format(
                    path,
                    "detected {} instances".format(len(predictions["instances"]))
                    if "instances" in predictions
                    else "finished",
                    time.time() - start_time,
                )
            )

            if args.output:
                if os.path.isdir(args.output):
                    out_filename = os.path.join(args.output, os.path.basename(path))
                else:
                    # A file path as --output only makes sense for one image.
                    assert len(args.input) == 1, "Please specify a directory with args.output"
                    out_filename = args.output
                visualized_output.save(out_filename)
            else:
                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
                if cv2.waitKey(0) == 27:
                    break  # esc to quit
    elif args.webcam:
        assert args.input is None, "Cannot have both --input and --webcam!"
        assert args.output is None, "output not yet supported with --webcam!"
        cam = cv2.VideoCapture(0)
        for vis in tqdm.tqdm(demo.run_on_video(cam)):
            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
            cv2.imshow(WINDOW_NAME, vis)
            if cv2.waitKey(1) == 27:
                break  # esc to quit
        cam.release()
        cv2.destroyAllWindows()
    elif args.video_input:
        video = cv2.VideoCapture(args.video_input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames_per_second = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        basename = os.path.basename(args.video_input)
        # Prefer x264/.mkv; fall back to mp4v/.mp4 when the probe write fails.
        codec, file_ext = (
            ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
        )
        # The fallback codec string is "mp4v" (no leading dot); comparing
        # against ".mp4v" meant this warning could never be emitted.
        if codec == "mp4v":
            warnings.warn("x264 codec not available, switching to mp4v")
        if args.output:
            if os.path.isdir(args.output):
                output_fname = os.path.join(args.output, basename)
                output_fname = os.path.splitext(output_fname)[0] + file_ext
            else:
                output_fname = args.output
            # Refuse to overwrite an existing output file.
            assert not os.path.isfile(output_fname), output_fname
            output_file = cv2.VideoWriter(
                filename=output_fname,
                # some installation of opencv may not support x264 (due to its license),
                # you can try other format (e.g. MPEG)
                fourcc=cv2.VideoWriter_fourcc(*codec),
                fps=float(frames_per_second),
                frameSize=(width, height),
                isColor=True,
            )
        assert os.path.isfile(args.video_input)
        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
            if args.output:
                output_file.write(vis_frame)
            else:
                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
                cv2.imshow(basename, vis_frame)
                if cv2.waitKey(1) == 27:
                    break  # esc to quit
        video.release()
        if args.output:
            output_file.release()
        else:
            cv2.destroyAllWindows()
diff --git a/demo/predictor.py b/demo/predictor.py
new file mode 100644
index 0000000..b2b2cf1
--- /dev/null
+++ b/demo/predictor.py
@@ -0,0 +1,224 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import atexit
+import bisect
+import multiprocessing as mp
+from collections import deque
+import cv2
+import torch
+from detectron2.data import MetadataCatalog
+from detectron2.engine.defaults import DefaultPredictor
+from detectron2.utils.video_visualizer import VideoVisualizer
+from detectron2.utils.visualizer import ColorMode, Visualizer
+from detectron2.data.datasets.builtin_meta import _get_coco_instances_meta
class VisualizationDemo(object):
    """Runs a predictor on images or video frames and draws the predictions."""

    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode): config used to build the predictor; the last entry
                of ``cfg.DATASETS.TEST`` (or "__unused" if empty) selects the
                metadata used for drawing.
            instance_mode (ColorMode): color mode passed to the visualizers.
            parallel (bool): whether to run the model in different processes
                from visualization. Useful since the visualization logic can
                be slow.
        """
        test_sets = cfg.DATASETS.TEST
        dataset_name = test_sets[-1] if len(test_sets) else "__unused"
        self.metadata = MetadataCatalog.get(dataset_name)

        # COCO instance palette with black appended as one extra color
        # (presumably for the additional "unknown" class -- confirm).
        palette = _get_coco_instances_meta()["thing_colors"]
        palette.append((0, 0, 0))
        self.metadata.set(thing_colors=palette)

        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode
        self.parallel = parallel

        if not parallel:
            self.predictor = DefaultPredictor(cfg)
        else:
            self.predictor = AsyncPredictor(cfg, num_gpus=torch.cuda.device_count())

    def run_on_image(self, image):
        """
        Run the predictor on one image and draw what it returned.

        Args:
            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
                This is the format used by OpenCV.

        Returns:
            predictions (dict): the output of the model.
            vis_output (VisImage): the visualized image output, or None when
                the predictions contain none of the drawable keys.
        """
        predictions = self.predictor(image)
        # The visualizer works on RGB, so reverse the OpenCV BGR channel order.
        rgb_image = image[:, :, ::-1]
        visualizer = Visualizer(rgb_image, self.metadata, instance_mode=self.instance_mode)

        # Panoptic output takes precedence and excludes the other drawings.
        if "panoptic_seg" in predictions:
            panoptic_seg, segments_info = predictions["panoptic_seg"]
            vis_output = visualizer.draw_panoptic_seg_predictions(
                panoptic_seg.to(self.cpu_device), segments_info
            )
            return predictions, vis_output

        vis_output = None
        if "sem_seg" in predictions:
            sem_seg = predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
            vis_output = visualizer.draw_sem_seg(sem_seg)
        if "instances" in predictions:
            instances = predictions["instances"].to(self.cpu_device)
            vis_output = visualizer.draw_instance_predictions(predictions=instances)
        return predictions, vis_output

    def _frame_from_video(self, video):
        """Yield frames from `video` until it is closed or a read fails."""
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            yield frame

    def run_on_video(self, video):
        """
        Visualizes predictions on frames of the input video.

        Args:
            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose
                source can be either a webcam or a video file.

        Yields:
            ndarray: BGR visualizations of each video frame.
        """
        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)

        def process_predictions(frame, predictions):
            # Draw on an RGB copy of the frame, then convert back to BGR.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if "panoptic_seg" in predictions:
                panoptic_seg, segments_info = predictions["panoptic_seg"]
                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
                    rgb_frame, panoptic_seg.to(self.cpu_device), segments_info
                )
            elif "instances" in predictions:
                instances = predictions["instances"].to(self.cpu_device)
                vis_frame = video_visualizer.draw_instance_predictions(rgb_frame, instances)
            elif "sem_seg" in predictions:
                sem_seg = predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
                vis_frame = video_visualizer.draw_sem_seg(rgb_frame, sem_seg)
            return cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)

        frame_gen = self._frame_from_video(video)

        if not self.parallel:
            for frame in frame_gen:
                yield process_predictions(frame, self.predictor(frame))
            return

        # Parallel mode: keep up to `buffer_size` frames in flight before
        # starting to collect results, then drain what is left at the end.
        buffer_size = self.predictor.default_buffer_size
        pending_frames = deque()
        for cnt, frame in enumerate(frame_gen):
            pending_frames.append(frame)
            self.predictor.put(frame)
            if cnt >= buffer_size:
                yield process_predictions(pending_frames.popleft(), self.predictor.get())
        while pending_frames:
            yield process_predictions(pending_frames.popleft(), self.predictor.get())
class AsyncPredictor:
    """
    A predictor that runs the model asynchronously, possibly on >1 GPUs.
    Because rendering the visualization takes a considerable amount of time,
    this helps improve throughput a little bit when rendering videos.
    """

    class _StopToken:
        """Marker put on the task queue to tell a worker to exit its loop."""

    class _PredictWorker(mp.Process):
        """Process that pulls (idx, data) tasks and pushes (idx, result) back."""

        def __init__(self, cfg, task_queue, result_queue):
            self.cfg = cfg
            self.task_queue = task_queue
            self.result_queue = result_queue
            super().__init__()

        def run(self):
            # The predictor is built inside the worker process.
            predictor = DefaultPredictor(self.cfg)
            task = self.task_queue.get()
            while not isinstance(task, AsyncPredictor._StopToken):
                idx, data = task
                self.result_queue.put((idx, predictor(data)))
                task = self.task_queue.get()

    def __init__(self, cfg, num_gpus: int = 1):
        """
        Args:
            cfg (CfgNode):
            num_gpus (int): if 0, will run on CPU
        """
        num_workers = max(num_gpus, 1)
        queue_capacity = num_workers * 3
        self.task_queue = mp.Queue(maxsize=queue_capacity)
        self.result_queue = mp.Queue(maxsize=queue_capacity)

        self.procs = []
        for gpuid in range(num_workers):
            # Each worker gets its own config copy pinned to one device.
            cfg = cfg.clone()
            cfg.defrost()
            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
            worker = AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
            self.procs.append(worker)

        self.put_idx = 0
        self.get_idx = 0
        # Out-of-order results, kept sorted by index (parallel lists).
        self.result_rank = []
        self.result_data = []

        for worker in self.procs:
            worker.start()
        atexit.register(self.shutdown)

    def put(self, image):
        """Queue `image` for prediction under the next sequential index."""
        self.put_idx += 1
        self.task_queue.put((self.put_idx, image))

    def get(self):
        """Return the next result in submission order, waiting for it if needed."""
        self.get_idx += 1  # the index needed for this request
        wanted = self.get_idx

        # Already received earlier and parked at the head of the buffer.
        if self.result_rank and self.result_rank[0] == wanted:
            self.result_rank.pop(0)
            return self.result_data.pop(0)

        while True:
            # make sure the results are returned in the correct order
            idx, res = self.result_queue.get()
            if idx == wanted:
                return res
            slot = bisect.bisect(self.result_rank, idx)
            self.result_rank.insert(slot, idx)
            self.result_data.insert(slot, res)

    def __len__(self):
        """Number of submitted tasks whose result has not been fetched yet."""
        return self.put_idx - self.get_idx

    def __call__(self, image):
        """Submit `image` and return the next result in order."""
        self.put(image)
        return self.get()

    def shutdown(self):
        """Send one stop token per worker process."""
        for _ in self.procs:
            self.task_queue.put(AsyncPredictor._StopToken())

    @property
    def default_buffer_size(self):
        """Five in-flight frames per worker process."""
        return len(self.procs) * 5
diff --git a/docs/opendet2.png b/docs/opendet2.png
new file mode 100644
index 0000000..6e65bda
Binary files /dev/null and b/docs/opendet2.png differ
diff --git a/opendet2/__init__.py b/opendet2/__init__.py
new file mode 100644
index 0000000..f63ee6e
--- /dev/null
+++ b/opendet2/__init__.py
@@ -0,0 +1,7 @@
+from .config import *
+from .data import *
+from .engine import *
+from .evaluation import *
+from .modeling import *
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/opendet2/config/__init__.py b/opendet2/config/__init__.py
new file mode 100644
index 0000000..b7617fc
--- /dev/null
+++ b/opendet2/config/__init__.py
@@ -0,0 +1,3 @@
+from .defaults import add_opendet_config
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
\ No newline at end of file
diff --git a/opendet2/config/defaults.py b/opendet2/config/defaults.py
new file mode 100644
index 0000000..9f2097f
--- /dev/null
+++ b/opendet2/config/defaults.py
@@ -0,0 +1,50 @@
+from detectron2.config import CfgNode as CN
+def add_opendet_config(cfg):
+ _C = cfg
+ # unknown probability loss
+ _C.UPLOSS = CN()
+ _C.UPLOSS.START_ITER = 100 # usually the same as warmup iter
+ # instance contrastive loss
+ _C.ICLOSS = CN()
+ # register RoI output layer
+ # known classes
+ # thresh for visualization results.
+ # scale for cosine classifier
+ # swin transformer
+ _C.MODEL.SWINT.OUT_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
+ _C.MODEL.SWINT.DEPTHS = [2, 2, 6, 2]
+ _C.MODEL.SWINT.NUM_HEADS = [3, 6, 12, 24]
+ # solver, e.g., adamw for swin
+ _C.SOLVER.BETAS = (0.9, 0.999)
diff --git a/opendet2/data/__init__.py b/opendet2/data/__init__.py
new file mode 100644
index 0000000..75e3253
--- /dev/null
+++ b/opendet2/data/__init__.py
@@ -0,0 +1,4 @@
+from .build import *
+from . import builtin
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/opendet2/data/build.py b/opendet2/data/build.py
new file mode 100644
index 0000000..1ad5b9c
--- /dev/null
+++ b/opendet2/data/build.py
@@ -0,0 +1,299 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+import numpy as np
+import copy
+import torch.utils.data
+import torch
+from detectron2.config import configurable
+from detectron2.utils.logger import _log_api_usage
+from detectron2.data.catalog import DatasetCatalog, MetadataCatalog
+from detectron2.data.common import DatasetFromList, MapDataset
+from detectron2.data.dataset_mapper import DatasetMapper
+from detectron2.data.detection_utils import check_metadata_consistency
+from detectron2.data.samplers import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler
+from detectron2.data.build import trivial_batch_collator
+from detectron2.data import (build_batch_data_loader,
+ print_instances_class_histogram,
+ load_proposals_into_dataset)
+from detectron2.data.build import (filter_images_with_few_keypoints,
+ filter_images_with_only_crowd_annotations)
+# This file contains the default logic to build a dataloader for training or testing.
+__all__ = [
+ "build_detection_train_loader",
+ "build_detection_test_loader",
+ "get_detection_dataset_dicts",
def get_detection_dataset_dicts(names, filter_empty=True, min_keypoints=0, proposal_files=None, cfg=None):
    """
    Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.

    In addition to loading and filtering, the first dataset name decides how
    annotations are relabelled for open-set use: if it contains 'train',
    instances outside the known classes are removed (see
    `remove_unk_instances`); otherwise, if it contains 'test', they are
    relabelled as unknown (see `label_known_class_and_unknown`).

    Args:
        names (str or list[str]): a dataset name or a list of dataset names
        filter_empty (bool): whether to filter out images without instance annotations
        min_keypoints (int): filter out images with fewer keypoints than
            `min_keypoints`. Set to 0 to do nothing.
        proposal_files (list[str]): if given, a list of object proposal files
            that match each dataset in `names`.
        cfg (CfgNode or None): config providing the known / total class
            counts read by the relabelling helpers. When None, or when the
            dataset carries no "annotations", the open-set relabelling is
            skipped with a warning.

    Returns:
        list[dict]: a list of dicts following the standard dataset dict format.
    """
    if isinstance(names, str):
        names = [names]
    assert len(names), names
    dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names]
    for dataset_name, dicts in zip(names, dataset_dicts):
        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)

    if proposal_files is not None:
        assert len(names) == len(proposal_files)
        # load precomputed proposals from proposal files
        dataset_dicts = [
            load_proposals_into_dataset(dataset_i_dicts, proposal_file)
            for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
        ]

    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))

    has_instances = "annotations" in dataset_dicts[0]
    if filter_empty and has_instances:
        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
    if min_keypoints > 0 and has_instances:
        dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)

    # Only the first name decides between train-style and test-style handling.
    d_name = names[0]
    is_train = 'train' in d_name
    is_test = 'test' in d_name
    if is_train or is_test:
        if cfg is None or not has_instances:
            # The helpers dereference cfg.MODEL.ROI_HEADS and index
            # entry["annotations"]; without either they cannot run.
            logging.getLogger(__name__).warning(
                "Skipping open-set relabelling for '%s': %s.",
                d_name,
                "no cfg given" if cfg is None else "dataset has no annotations",
            )
        elif is_train:
            dataset_dicts = remove_unk_instances(cfg, dataset_dicts)
        else:
            dataset_dicts = label_known_class_and_unknown(cfg, dataset_dicts)

    if has_instances:
        try:
            class_names = MetadataCatalog.get(names[0]).thing_classes
            check_metadata_consistency("thing_classes", names)
            print_instances_class_histogram(dataset_dicts, class_names)
        except AttributeError:  # class names are not available for this dataset
            pass

    assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
    return dataset_dicts
def remove_unk_instances(cfg, dataset_dicts):
    """Drop annotations of unknown classes and images left without annotations.

    Known classes are the category ids ``0 .. NUM_KNOWN_CLASSES - 1``. Both the
    per-image annotation lists and `dataset_dicts` itself are modified in
    place.

    Args:
        cfg: config providing ``MODEL.ROI_HEADS.NUM_KNOWN_CLASSES``.
        dataset_dicts (list[dict]): entries with an "annotations" list whose
            items carry a "category_id".

    Returns:
        list[dict]: the same `dataset_dicts` list object, filtered.
    """
    num_known_classes = cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES
    valid_classes = range(0, num_known_classes)
    logger = logging.getLogger(__name__)
    logger.info("Valid classes: " + str(valid_classes))
    logger.info("Removing unknown objects...")

    # Rebuild each list once instead of calling list.remove per item, which
    # rescans the list and compares dicts deeply on every removal.
    for entry in dataset_dicts:
        annos = entry["annotations"]
        annos[:] = [anno for anno in annos if anno["category_id"] in valid_classes]

    # Slice assignment keeps the caller's list object, like the in-place
    # removal it replaces.
    dataset_dicts[:] = [entry for entry in dataset_dicts if len(entry["annotations"]) != 0]
    return dataset_dicts
def label_known_class_and_unknown(cfg, dataset_dicts):
    """Relabel every annotation outside the known classes as the unknown class.

    Known classes are the category ids ``0 .. NUM_KNOWN_CLASSES - 1``; any other
    category id is overwritten in place with ``NUM_CLASSES - 1``.

    Args:
        cfg: config providing ``MODEL.ROI_HEADS.NUM_KNOWN_CLASSES`` and
            ``MODEL.ROI_HEADS.NUM_CLASSES``.
        dataset_dicts (list[dict]): entries with an "annotations" list whose
            items carry a "category_id".

    Returns:
        list[dict]: the same `dataset_dicts` object, with updated annotations.
    """
    num_known_classes = cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES
    total_num_class = cfg.MODEL.ROI_HEADS.NUM_CLASSES
    known_classes = range(0, num_known_classes)

    logger = logging.getLogger(__name__)
    logger.info("Known classes: " + str(known_classes))
    logger.info(
        "Labelling known instances the corresponding label, and unknown instances as unknown...")

    # The last class index is the one reserved for unknown objects.
    unknown_label = total_num_class - 1
    for entry in dataset_dicts:
        for annotation in entry["annotations"]:
            if annotation["category_id"] in known_classes:
                continue
            annotation["category_id"] = unknown_label
    return dataset_dicts
+def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
+ if dataset is None:
+ dataset = get_detection_dataset_dicts(
+ else 0,
+ cfg=cfg
+ )
+ _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])
+ if mapper is None:
+ mapper = DatasetMapper(cfg, True)
+ else:
+ mapper = mapper(cfg, True)
+ if sampler is None:
+ sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
+ logger = logging.getLogger(__name__)
+ logger.info("Using training sampler {}".format(sampler_name))
+ if sampler_name == "TrainingSampler":
+ sampler = TrainingSampler(len(dataset))
+ elif sampler_name == "RepeatFactorTrainingSampler":
+ repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
+ )
+ sampler = RepeatFactorTrainingSampler(repeat_factors)
+ else:
+ raise ValueError(
+ "Unknown training sampler: {}".format(sampler_name))
+ return {
+ "dataset": dataset,
+ "sampler": sampler,
+ "mapper": mapper,
+ "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
+ "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
+ "num_workers": cfg.DATALOADER.NUM_WORKERS,
+ }
+# TODO can allow dataset as an iterable or IterableDataset to make this function more general
+def build_detection_train_loader(
+ dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0
+ """
+ Build a dataloader for object detection with some default features.
+ This interface is experimental.
+ Args:
+ dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
+ or a map-style pytorch dataset. They can be obtained by using
+ :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
+ mapper (callable): a callable which takes a sample (dict) from dataset and
+ returns the format to be consumed by the model.
+ When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
+ sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
+ indices to be applied on ``dataset``. Default to :class:`TrainingSampler`,
+ which coordinates an infinite random shuffle sequence across all workers.
+ total_batch_size (int): total batch size across all workers. Batching
+ simply puts data into a list.
+ aspect_ratio_grouping (bool): whether to group images with similar
+ aspect ratio for efficiency. When enabled, it requires each
+ element in dataset be a dict with keys "width" and "height".
+ num_workers (int): number of parallel data loading workers
+ Returns:
+ torch.utils.data.DataLoader:
+ a dataloader. Each output from it is a ``list[mapped_element]`` of length
+ ``total_batch_size / num_workers``, where ``mapped_element`` is produced
+ by the ``mapper``.
+ """
+ if isinstance(dataset, list):
+ dataset = DatasetFromList(dataset, copy=False)
+ if mapper is not None:
+ dataset = MapDataset(dataset, mapper)
+ if sampler is None:
+ sampler = TrainingSampler(len(dataset))
+ assert isinstance(sampler, torch.utils.data.sampler.Sampler)
+ return build_batch_data_loader(
+ dataset,
+ sampler,
+ total_batch_size,
+ aspect_ratio_grouping=aspect_ratio_grouping,
+ num_workers=num_workers,
+ )
+def _test_loader_from_config(cfg, dataset_name, mapper=None):
+ """
+ Uses the given `dataset_name` argument (instead of the names in cfg), because the
+ standard practice is to evaluate each test set individually (not combining them).
+ """
+ if isinstance(dataset_name, str):
+ dataset_name = [dataset_name]
+ dataset = get_detection_dataset_dicts(
+ dataset_name,
+ filter_empty=False,
+ proposal_files=[
+ cfg.DATASETS.TEST).index(dataset_name)]
+ ]
+ else None,
+ cfg=cfg
+ )
+ if mapper is None:
+ mapper = DatasetMapper(cfg, False)
+ return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS}
def build_detection_test_loader(dataset, *, mapper, sampler=None, num_workers=0):
    """
    Build a dataloader for inference that yields one image per batch.

    Counterpart of `build_detection_train_loader` with a fixed batch size of 1
    and, by default, an :class:`InferenceSampler` over the whole dataset.
    This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. A list is wrapped in
            :class:`DatasetFromList` without copying.
        mapper (callable or None): a callable which takes a sample (dict) from
            dataset and returns the format to be consumed by the model. When
            not None the dataset is wrapped in :class:`MapDataset`.
        sampler (torch.utils.data.sampler.Sampler or None): a sampler that
            produces indices to be applied on ``dataset``. Defaults to
            ``InferenceSampler(len(dataset))``.
        num_workers (int): number of parallel data loading workers

    Returns:
        DataLoader: a torch DataLoader over the (mapped) dataset that batches
        single samples with ``trivial_batch_collator``.

    Examples:
    ::
        data_loader = build_detection_test_loader(
            DatasetCatalog.get("my_test"),
            mapper=DatasetMapper(...))
    """
    wrapped = dataset
    if isinstance(wrapped, list):
        wrapped = DatasetFromList(wrapped, copy=False)
    if mapper is not None:
        wrapped = MapDataset(wrapped, mapper)

    index_sampler = sampler
    if index_sampler is None:
        index_sampler = InferenceSampler(len(wrapped))

    # Always use 1 image per worker during inference since this is the
    # standard when reporting inference time in papers.
    one_per_batch = torch.utils.data.sampler.BatchSampler(index_sampler, 1, drop_last=False)
    return torch.utils.data.DataLoader(
        wrapped,
        num_workers=num_workers,
        batch_sampler=one_per_batch,
        collate_fn=trivial_batch_collator,
    )
diff --git a/opendet2/data/builtin.py b/opendet2/data/builtin.py
new file mode 100644
index 0000000..b117095
--- /dev/null
+++ b/opendet2/data/builtin.py
@@ -0,0 +1,31 @@
+import os
+from .voc_coco import register_voc_coco
+from detectron2.data import MetadataCatalog
def register_all_voc_coco(root):
    """Register every VOC-COCO open-set split found under `root`.

    Each split is registered through `register_voc_coco` and its metadata is
    tagged with the "pascal_voc" evaluator type.

    Args:
        root (str): directory that contains the ``voc_coco`` dataset folder.
    """
    # (registered name, dataset folder below root, image-set split name)
    splits = (
        # VOC_COCO_openset
        ("voc_coco_20_40_test", "voc_coco", "voc_coco_20_40_test"),
        ("voc_coco_20_60_test", "voc_coco", "voc_coco_20_60_test"),
        ("voc_coco_20_80_test", "voc_coco", "voc_coco_20_80_test"),
        ("voc_coco_2500_test", "voc_coco", "voc_coco_2500_test"),
        ("voc_coco_5000_test", "voc_coco", "voc_coco_5000_test"),
        ("voc_coco_10000_test", "voc_coco", "voc_coco_10000_test"),
        ("voc_coco_20000_test", "voc_coco", "voc_coco_20000_test"),
        ("voc_coco_val", "voc_coco", "voc_coco_val"),
    )
    for name, dirname, split in splits:
        # The year is derived from the registered name; 2012 unless it says 2007.
        if "2007" in name:
            year = 2007
        else:
            year = 2012
        dataset_dir = os.path.join(root, dirname)
        register_voc_coco(name, dataset_dir, split, year)
        MetadataCatalog.get(name).evaluator_type = "pascal_voc"
if __name__.endswith(".builtin"):
    # Runs when this module is imported as "<package>.builtin": register all
    # splits below $DETECTRON2_DATASETS, falling back to "./datasets".
    _root = os.environ.get("DETECTRON2_DATASETS", "datasets")
    register_all_voc_coco(_root)
diff --git a/opendet2/data/voc_coco.py b/opendet2/data/voc_coco.py
new file mode 100644
index 0000000..4e5b39f
--- /dev/null
+++ b/opendet2/data/voc_coco.py
@@ -0,0 +1,35 @@
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.data.datasets import load_voc_instances
+# Class names in label-index order: the 20 VOC classes, three groups of 20 COCO
+# classes, and a final "unknown" entry. `register_voc_coco` passes this list to
+# the loader and stores it as the dataset's `thing_classes`.
+VOC_COCO_CATEGORIES = [
+    # VOC
+    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
+    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
+    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
+    # COCO-20-40
+    "truck", "traffic light", "fire hydrant", "stop sign", "parking meter",
+    "bench", "elephant", "bear", "zebra", "giraffe",
+    "backpack", "umbrella", "handbag", "tie", "suitcase",
+    "microwave", "oven", "toaster", "sink", "refrigerator",
+    # COCO-40-60
+    "frisbee", "skis", "snowboard", "sports ball", "kite",
+    "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
+    "banana", "apple", "sandwich", "orange", "broccoli",
+    "carrot", "hot dog", "pizza", "donut", "cake",
+    # COCO-60-80
+    "bed", "toilet", "laptop", "mouse",
+    "remote", "keyboard", "cell phone", "book", "clock",
+    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
+    "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+    # Unknown
+    "unknown",
+]
+def register_voc_coco(name, dirname, split, year):
+    """Register one VOC-format split with the dataset and metadata catalogs.
+    Args:
+        name (str): key under which the dataset is registered.
+        dirname (str): dataset directory handed to ``load_voc_instances``.
+        split (str): image-set name handed to ``load_voc_instances``.
+        year (int): stored in the metadata as-is.
+    """
+    categories = VOC_COCO_CATEGORIES
+    def _load_split():
+        # Loading is deferred until the catalog actually asks for the data.
+        return load_voc_instances(dirname, split, categories)
+    DatasetCatalog.register(name, _load_split)
+    metadata = MetadataCatalog.get(name)
+    metadata.set(
+        thing_classes=list(categories), dirname=dirname, year=year, split=split
+    )
diff --git a/opendet2/engine/__init__.py b/opendet2/engine/__init__.py
new file mode 100644
index 0000000..3912be5
--- /dev/null
+++ b/opendet2/engine/__init__.py
@@ -0,0 +1,3 @@
+from .defaults import OpenDetTrainer
+# Export every name currently in this module's globals that does not start
+# with an underscore.
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/opendet2/engine/defaults.py b/opendet2/engine/defaults.py
new file mode 100644
index 0000000..b7ae381
--- /dev/null
+++ b/opendet2/engine/defaults.py
@@ -0,0 +1,441 @@
+import logging
+import os
+import weakref
+from collections import OrderedDict
+from typing import Dict
+import torch
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import CfgNode
+from detectron2.data import MetadataCatalog
+from detectron2.engine import (AMPTrainer, SimpleTrainer,
+ TrainerBase, create_ddp_model, hooks, create_ddp_model, default_writers)
+from detectron2.evaluation import (DatasetEvaluator, DatasetEvaluators,
+ inference_on_dataset, print_csv_format,
+ verify_results)
+from detectron2.modeling import GeneralizedRCNNWithTTA, build_model
+from detectron2.solver import build_lr_scheduler
+from detectron2.utils import comm
+from detectron2.utils.logger import setup_logger
+from fvcore.nn.precise_bn import get_bn_modules
+from ..data import build_detection_test_loader, build_detection_train_loader
+from ..evaluation import PascalVOCDetectionEvaluator
+from ..solver import build_optimizer
+class OpenDetTrainer(TrainerBase):
+    """
+    A trainer with default training logic. It does the following:
+    1. Create a :class:`SimpleTrainer` (or :class:`AMPTrainer` when
+       ``cfg.SOLVER.AMP.ENABLED`` is set) using model, optimizer, dataloader
+       defined by the given config. Create a LR scheduler defined by the config.
+    2. Load the last checkpoint or `cfg.MODEL.WEIGHTS`, if exists, when
+       `resume_or_load` is called.
+    3. Register a few common hooks defined by the config.
+    It is created to simplify the **standard model training workflow** and reduce code boilerplate
+    for users who only need the standard training workflow, with standard features.
+    It means this class makes *many assumptions* about your training logic that
+    may easily become invalid in a new research. In fact, any assumptions beyond those made in the
+    :class:`SimpleTrainer` are too much for research.
+    The code of this class has been annotated about restrictive assumptions it makes.
+    When they do not work for you, you're encouraged to:
+    1. Overwrite methods of this class, OR:
+    2. Use :class:`SimpleTrainer`, which only does minimal SGD training and
+       nothing else. You can then add your own hooks if needed. OR:
+    3. Write your own training loop similar to `tools/plain_train_net.py`.
+    See the :doc:`/tutorials/training` tutorials for more details.
+    Note that the behavior of this class, like other functions/classes in
+    this file, is not stable, since it is meant to represent the "common default behavior".
+    It is only guaranteed to work well with the standard models and training workflow in detectron2.
+    To obtain more stable behavior, write your own training logic with other public APIs.
+    Examples:
+    ::
+        trainer = OpenDetTrainer(cfg)
+        trainer.resume_or_load()  # load last checkpoint or MODEL.WEIGHTS
+        trainer.train()
+    Attributes:
+        scheduler: the LR scheduler returned by :meth:`build_lr_scheduler`.
+        checkpointer (DetectionCheckpointer): saves/loads the model together
+            with this trainer's state.
+        cfg (CfgNode): the config in use (after :meth:`auto_scale_workers`).
+    """
+    def __init__(self, cfg):
+        """
+        Args:
+            cfg (CfgNode):
+        """
+        super().__init__()
+        logger = logging.getLogger("detectron2")
+        # setup_logger is not called for d2
+        if not logger.isEnabledFor(logging.INFO):
+            setup_logger()
+        cfg = OpenDetTrainer.auto_scale_workers(cfg, comm.get_world_size())
+        # Assume these objects must be constructed in this order.
+        model = self.build_model(cfg)
+        optimizer = self.build_optimizer(cfg, model)
+        data_loader = self.build_train_loader(cfg)
+        # Unlike a plain `create_ddp_model(model, broadcast_buffers=False)`,
+        # unused-parameter detection is switched on here.
+        model = create_ddp_model(
+            model, broadcast_buffers=False, find_unused_parameters=True)
+        trainer_cls = AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer
+        self._trainer = trainer_cls(model, data_loader, optimizer)
+        self.scheduler = self.build_lr_scheduler(cfg, optimizer)
+        self.checkpointer = DetectionCheckpointer(
+            # Assume you want to save checkpoints together with logs/statistics
+            model,
+            cfg.OUTPUT_DIR,
+            # A weak proxy avoids a reference cycle trainer <-> checkpointer.
+            trainer=weakref.proxy(self),
+        )
+        self.start_iter = 0
+        self.max_iter = cfg.SOLVER.MAX_ITER
+        self.cfg = cfg
+        self.register_hooks(self.build_hooks())
+    def resume_or_load(self, resume=True):
+        """
+        If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by
+        a `last_checkpoint` file), resume from the file. Resuming means loading all
+        available states (eg. optimizer and scheduler) and update iteration counter
+        from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used.
+        Otherwise, this is considered as an independent training. The method will load model
+        weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start
+        from iteration 0.
+        Args:
+            resume (bool): whether to do resume or not
+        """
+        self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume)
+        if resume and self.checkpointer.has_checkpoint():
+            # The checkpoint stores the training iteration that just finished, thus we start
+            # at the next iteration
+            self.start_iter = self.iter + 1
+    def build_hooks(self):
+        """
+        Build a list of default hooks, including timing, evaluation,
+        checkpointing, lr scheduling, precise BN, writing events.
+        Returns:
+            list[HookBase]: may contain a ``None`` placeholder where PreciseBN
+            is disabled.
+        """
+        cfg = self.cfg.clone()
+        cfg.defrost()
+        cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN
+        ret = [
+            hooks.IterationTimer(),
+            hooks.LRScheduler(),
+            hooks.PreciseBN(
+                # Run at the same freq as (but before) evaluation.
+                cfg.TEST.EVAL_PERIOD,
+                self.model,
+                # Build a new data loader to not affect training
+                self.build_train_loader(cfg),
+                cfg.TEST.PRECISE_BN.NUM_ITER,
+            )
+            if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
+            else None,
+        ]
+        # Do PreciseBN before checkpointer, because it updates the model and need to
+        # be saved by checkpointer.
+        # This is not always the best: if checkpointing has a different frequency,
+        # some checkpoints may have more precise statistics than others.
+        if comm.is_main_process():
+            ret.append(hooks.PeriodicCheckpointer(
+                self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD))
+        def test_and_save_results():
+            # Keep the latest metrics so that `train()` can verify/return them.
+            self._last_eval_results = self.test(self.cfg, self.model)
+            return self._last_eval_results
+        # Do evaluation after checkpointer, because then if it fails,
+        # we can use the saved checkpoint to debug.
+        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
+        if comm.is_main_process():
+            # Here the default print/log frequency of each writer is used.
+            # run writers in the end, so that evaluation metrics are written
+            ret.append(hooks.PeriodicWriter(self.build_writers(), period=20))
+        return ret
+    def build_writers(self):
+        """
+        Build a list of writers to be used using :func:`default_writers()`.
+        If you'd like a different list of writers, you can overwrite it in
+        your trainer.
+        Returns:
+            list[EventWriter]: a list of :class:`EventWriter` objects.
+        """
+        return default_writers(self.cfg.OUTPUT_DIR, self.max_iter)
+    def train(self):
+        """
+        Run training.
+        Returns:
+            The last evaluation results, on the main process and only when
+            ``cfg.TEST.EXPECTED_RESULTS`` is non-empty. Otherwise None.
+        """
+        super().train(self.start_iter, self.max_iter)
+        if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
+            assert hasattr(
+                self, "_last_eval_results"
+            ), "No evaluation results obtained during training!"
+            verify_results(self.cfg, self._last_eval_results)
+            return self._last_eval_results
+    def run_step(self):
+        """Run one iteration of the wrapped trainer at the current iteration."""
+        # The wrapped trainer keeps its own counter; sync it before stepping.
+        self._trainer.iter = self.iter
+        self._trainer.run_step()
+    @classmethod
+    def build_model(cls, cfg):
+        """
+        Returns:
+            torch.nn.Module:
+        It now calls :func:`detectron2.modeling.build_model`.
+        Overwrite it if you'd like a different model.
+        """
+        model = build_model(cfg)
+        logger = logging.getLogger(__name__)
+        logger.info("Model:\n{}".format(model))
+        return model
+    @classmethod
+    def build_optimizer(cls, cfg, model):
+        """
+        Returns:
+            torch.optim.Optimizer:
+        It now calls this package's ``solver.build_optimizer``.
+        Overwrite it if you'd like a different optimizer.
+        """
+        return build_optimizer(cfg, model)
+    @classmethod
+    def build_lr_scheduler(cls, cfg, optimizer):
+        """
+        It now calls :func:`detectron2.solver.build_lr_scheduler`.
+        Overwrite it if you'd like a different scheduler.
+        """
+        return build_lr_scheduler(cfg, optimizer)
+    @classmethod
+    def build_train_loader(cls, cfg):
+        """
+        Returns:
+            iterable
+        It now calls this package's ``data.build_detection_train_loader``.
+        Overwrite it if you'd like a different data loader.
+        """
+        return build_detection_train_loader(cfg)
+    @classmethod
+    def build_test_loader(cls, cfg, dataset_name):
+        """
+        Returns:
+            iterable
+        It now calls this package's ``data.build_detection_test_loader``.
+        Overwrite it if you'd like a different data loader.
+        """
+        return build_detection_test_loader(cfg, dataset_name)
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Build the evaluator for one dataset.
+        Args:
+            cfg (CfgNode):
+            dataset_name (str): a registered dataset name.
+            output_folder: accepted for API compatibility; not used here.
+        Returns:
+            PascalVOCDetectionEvaluator: when the dataset's ``evaluator_type``
+            metadata is "pascal_voc".
+        Raises:
+            NotImplementedError: for any other evaluator type.
+        """
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        if evaluator_type == "pascal_voc":
+            return PascalVOCDetectionEvaluator(dataset_name, cfg)
+        raise NotImplementedError(
+            "no Evaluator for the dataset {} with the type {}".format(
+                dataset_name, evaluator_type
+            )
+        )
+    @classmethod
+    def test_with_TTA(cls, cfg, model):
+        """
+        Evaluate ``model`` wrapped in :class:`GeneralizedRCNNWithTTA`.
+        Returns:
+            OrderedDict: the results of :meth:`test` with "_TTA" appended to
+            every key.
+        """
+        logger = logging.getLogger("detectron2.trainer")
+        # In the end of training, run an evaluation with TTA
+        # Only support some R-CNN models.
+        logger.info("Running inference with test-time augmentation ...")
+        model = GeneralizedRCNNWithTTA(cfg, model)
+        evaluators = [
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(
+                    cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+        return res
+    @classmethod
+    def test(cls, cfg, model, evaluators=None):
+        """
+        Args:
+            cfg (CfgNode):
+            model (nn.Module):
+            evaluators (list[DatasetEvaluator] or None): if None, will call
+                :meth:`build_evaluator`. Otherwise, must have the same length as
+                ``cfg.DATASETS.TEST``.
+        Returns:
+            dict: a dict of result metrics. With a single test dataset the
+            metrics of that dataset are returned directly; otherwise the dict
+            is keyed by dataset name.
+        """
+        logger = logging.getLogger(__name__)
+        if isinstance(evaluators, DatasetEvaluator):
+            evaluators = [evaluators]
+        if evaluators is not None:
+            assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
+                len(cfg.DATASETS.TEST), len(evaluators)
+            )
+        results = OrderedDict()
+        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
+            data_loader = cls.build_test_loader(cfg, dataset_name)
+            # When evaluators are passed in as arguments,
+            # implicitly assume that evaluators can be created before data_loader.
+            if evaluators is not None:
+                evaluator = evaluators[idx]
+            else:
+                try:
+                    evaluator = cls.build_evaluator(cfg, dataset_name)
+                except NotImplementedError:
+                    logger.warning(
+                        "No evaluator found. Use `OpenDetTrainer.test(evaluators=)`, "
+                        "or implement its `build_evaluator` method."
+                    )
+                    results[dataset_name] = {}
+                    continue
+            results_i = inference_on_dataset(model, data_loader, evaluator)
+            results[dataset_name] = results_i
+            if comm.is_main_process():
+                assert isinstance(
+                    results_i, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results_i
+                )
+                logger.info(
+                    "Evaluation results for {} in csv format:".format(dataset_name))
+                print_csv_format(results_i)
+        if len(results) == 1:
+            results = list(results.values())[0]
+        return results
+    @staticmethod
+    def auto_scale_workers(cfg, num_workers: int):
+        """
+        When the config is defined for certain number of workers (according to
+        ``cfg.SOLVER.REFERENCE_WORLD_SIZE``) that's different from the number of
+        workers currently in use, returns a new cfg where the total batch size
+        is scaled so that the per-GPU batch size stays the same as the
+        original ``IMS_PER_BATCH // REFERENCE_WORLD_SIZE``.
+        Other config options are also scaled accordingly:
+        * training steps, warmup steps, evaluation period and checkpoint period
+          are scaled inverse proportionally.
+        * learning rate are scaled proportionally, following :paper:`ImageNet in 1h`.
+        For example, with the original config like the following:
+        .. code-block:: yaml
+            IMS_PER_BATCH: 16
+            BASE_LR: 0.1
+            REFERENCE_WORLD_SIZE: 8
+            MAX_ITER: 5000
+            STEPS: (4000,)
+            CHECKPOINT_PERIOD: 1000
+        When this config is used on 16 GPUs instead of the reference number 8,
+        calling this method will return a new config with:
+        .. code-block:: yaml
+            IMS_PER_BATCH: 32
+            BASE_LR: 0.2
+            REFERENCE_WORLD_SIZE: 16
+            MAX_ITER: 2500
+            STEPS: (2000,)
+            CHECKPOINT_PERIOD: 500
+        Note that both the original config and this new config can be trained on 16 GPUs.
+        It's up to user whether to enable this feature (by setting ``REFERENCE_WORLD_SIZE``).
+        Returns:
+            CfgNode: a new config. Same as original if ``cfg.SOLVER.REFERENCE_WORLD_SIZE==0``
+            or if it already equals ``num_workers``.
+        """
+        old_world_size = cfg.SOLVER.REFERENCE_WORLD_SIZE
+        if old_world_size == 0 or old_world_size == num_workers:
+            return cfg
+        cfg = cfg.clone()
+        frozen = cfg.is_frozen()
+        cfg.defrost()
+        assert (
+            cfg.SOLVER.IMS_PER_BATCH % old_world_size == 0
+        ), "Invalid REFERENCE_WORLD_SIZE in config!"
+        scale = num_workers / old_world_size
+        bs = cfg.SOLVER.IMS_PER_BATCH = int(
+            round(cfg.SOLVER.IMS_PER_BATCH * scale))
+        lr = cfg.SOLVER.BASE_LR = cfg.SOLVER.BASE_LR * scale
+        max_iter = cfg.SOLVER.MAX_ITER = int(
+            round(cfg.SOLVER.MAX_ITER / scale))
+        warmup_iter = cfg.SOLVER.WARMUP_ITERS = int(
+            round(cfg.SOLVER.WARMUP_ITERS / scale))
+        cfg.SOLVER.STEPS = tuple(int(round(s / scale))
+                                 for s in cfg.SOLVER.STEPS)
+        cfg.TEST.EVAL_PERIOD = int(round(cfg.TEST.EVAL_PERIOD / scale))
+        cfg.SOLVER.CHECKPOINT_PERIOD = int(
+            round(cfg.SOLVER.CHECKPOINT_PERIOD / scale))
+        cfg.SOLVER.REFERENCE_WORLD_SIZE = num_workers  # maintain invariant
+        logger = logging.getLogger(__name__)
+        logger.info(
+            f"Auto-scaling the config to batch_size={bs}, learning_rate={lr}, "
+            f"max_iter={max_iter}, warmup={warmup_iter}."
+        )
+        # Hand the clone back in the same frozen state as the input.
+        if frozen:
+            cfg.freeze()
+        return cfg
+# Access basic attributes from the underlying trainer
+# Each name below becomes a read/write property on OpenDetTrainer that forwards
+# to the attribute of the same name on `self._trainer`.
+for _attr in ["model", "data_loader", "optimizer"]:
+ setattr(
+ OpenDetTrainer,
+ _attr,
+ property(
+ # getter
+ # `x=_attr` binds the current name as a default argument; a plain closure
+ # would only ever see the last value of the loop variable.
+ lambda self, x=_attr: getattr(self._trainer, x),
+ # setter
+ lambda self, value, x=_attr: setattr(self._trainer, x, value),
+ ),
+ )
diff --git a/opendet2/evaluation/__init__.py b/opendet2/evaluation/__init__.py
new file mode 100644
index 0000000..9925466
--- /dev/null
+++ b/opendet2/evaluation/__init__.py
@@ -0,0 +1,3 @@
+from .pascal_voc_evaluation import *
+# Export every name currently in this module's globals that does not start
+# with an underscore (this includes whatever the star-import above brought in).
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/opendet2/evaluation/pascal_voc_evaluation.py b/opendet2/evaluation/pascal_voc_evaluation.py
new file mode 100644
index 0000000..efd1fc3
--- /dev/null
+++ b/opendet2/evaluation/pascal_voc_evaluation.py
@@ -0,0 +1,377 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Code is modified from https://github.com/JosephKJ/OWOD
+import logging
+import os
+import tempfile
+import xml.etree.ElementTree as ET
+from collections import OrderedDict, defaultdict
+from functools import lru_cache
+from tabulate import tabulate
+import numpy as np
+import torch
+from detectron2.data import MetadataCatalog
+from detectron2.evaluation import DatasetEvaluator
+from detectron2.evaluation.pascal_voc_evaluation import voc_ap
+from detectron2.utils import comm
+from detectron2.utils.file_io import PathManager
+class PascalVOCDetectionEvaluator(DatasetEvaluator):
+    """
+    Open-set detection evaluator for VOC-format datasets.
+    Besides per-class AP it reports the Wilderness Impact (WI) at recall 0.8
+    and the number of unknown objects detected as a known class (AOSE), all at
+    IoU 0.5. Per-class detections are written as text files under
+    ``<cfg.OUTPUT_DIR>/pascal_voc_eval``.
+    """
+    def __init__(self, dataset_name, cfg=None):
+        """
+        Args:
+            dataset_name (str): name of the dataset, e.g., "voc_2007_test"
+            cfg (CfgNode or None): supplies ``OUTPUT_DIR`` and the class counts
+                in ``MODEL.ROI_HEADS``. :meth:`evaluate` reads attributes that
+                are only set when ``cfg`` is given.
+        """
+        self._dataset_name = dataset_name
+        meta = MetadataCatalog.get(dataset_name)
+        # Too many tiny files, download all to local for speed.
+        annotation_dir_local = PathManager.get_local_path(
+            os.path.join(meta.dirname, "Annotations/")
+        )
+        self._anno_file_template = os.path.join(annotation_dir_local, "{}.xml")
+        self._image_set_path = os.path.join(
+            meta.dirname, "ImageSets", "Main", meta.split + ".txt")
+        self._class_names = meta.thing_classes
+        assert meta.year in [2007, 2012], meta.year
+        self.logger = logging.getLogger(__name__)
+        self._is_2007 = meta.year == 2007
+        self._cpu_device = torch.device("cpu")
+        if cfg is not None:
+            self.output_dir = cfg.OUTPUT_DIR
+            self.total_num_class = cfg.MODEL.ROI_HEADS.NUM_CLASSES
+            # The unknown class takes the last label index.
+            self.unknown_class_index = self.total_num_class - 1
+            self.num_known_classes = cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES
+            self.known_classes = self._class_names[:self.num_known_classes]
+    def reset(self):
+        """Drop all predictions accumulated so far."""
+        # class index -> list of prediction strings
+        self._predictions = defaultdict(list)
+    def process(self, inputs, outputs):
+        """
+        Record the detections of one batch.
+        Args:
+            inputs: per-image dicts; only "image_id" is read.
+            outputs: per-image dicts; "instances" must provide ``pred_boxes``,
+                ``scores`` and ``pred_classes``.
+        """
+        for sample, output in zip(inputs, outputs):
+            image_id = sample["image_id"]
+            instances = output["instances"].to(self._cpu_device)
+            boxes = instances.pred_boxes.tensor.numpy()
+            scores = instances.scores.tolist()
+            classes = instances.pred_classes.tolist()
+            for box, score, cls in zip(boxes, scores, classes):
+                xmin, ymin, xmax, ymax = box
+                # The inverse of data loading logic in `datasets/pascal_voc.py`
+                xmin += 1
+                ymin += 1
+                self._predictions[cls].append(
+                    f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}"
+                )
+    def compute_WI_at_many_recall_level(self, recalls, tp_plus_fp_cs, fp_os):
+        """
+        Compute the Wilderness Impact for a set of recall levels.
+        Only recall 0.8 is evaluated at present.
+        Returns:
+            dict: recall level -> result of :meth:`compute_WI_at_a_recall_level`.
+        """
+        wi_at_recall = {}
+        for r in [8]:
+            r = r/10
+            wi = self.compute_WI_at_a_recall_level(
+                recalls, tp_plus_fp_cs, fp_os, recall_level=r)
+            wi_at_recall[r] = wi
+        return wi_at_recall
+    def compute_WI_at_a_recall_level(self, recalls, tp_plus_fp_cs, fp_os, recall_level=0.5):
+        """
+        Compute the Wilderness Impact at one recall level.
+        Args:
+            recalls (dict): IoU -> per-class recall curves.
+            tp_plus_fp_cs (dict): IoU -> per-class cumulative TP+FP counts.
+            fp_os (dict): IoU -> per-class cumulative counts of detections
+                that overlap an unknown object.
+            recall_level (float): recall at which the curves are sampled.
+        Returns:
+            dict: IoU -> mean(fp_os) / mean(tp_plus_fp_cs) over the known
+            classes that have detections, or 0 when there are none.
+        """
+        wi_at_iou = {}
+        for iou, recall in recalls.items():
+            tp_plus_fps = []
+            fps = []
+            for cls_id, rec in enumerate(recall):
+                if cls_id in range(self.num_known_classes) and len(rec) > 0:
+                    # Index of the curve point whose recall is closest to the target.
+                    index = min(range(len(rec)), key=lambda i: abs(
+                        rec[i] - recall_level))
+                    tp_plus_fps.append(tp_plus_fp_cs[iou][cls_id][index])
+                    fps.append(fp_os[iou][cls_id][index])
+            if len(tp_plus_fps) > 0:
+                wi_at_iou[iou] = np.mean(fps) / np.mean(tp_plus_fps)
+            else:
+                wi_at_iou[iou] = 0
+        return wi_at_iou
+    def evaluate(self):
+        """
+        Gather predictions from all ranks and compute the open-set metrics.
+        Returns:
+            dict: on the main process, a single-entry dict whose key is the
+            comma-joined metric names ("mAP,WI,AOSE,AP@K,P@K,R@K,AP@U,P@U,R@U")
+            and whose value is the comma-joined values rounded to 2 decimals.
+            None on every other process.
+        """
+        all_predictions = comm.gather(self._predictions, dst=0)
+        if not comm.is_main_process():
+            return
+        predictions = defaultdict(list)
+        for predictions_per_rank in all_predictions:
+            for clsid, lines in predictions_per_rank.items():
+                predictions[clsid].extend(lines)
+        del all_predictions
+        self.logger.info(
+            "Evaluating {} using {} metric. "
+            "Note that results do not use the official Matlab API.".format(
+                self._dataset_name, 2007 if self._is_2007 else 2012
+            )
+        )
+        dirname = os.path.join(self.output_dir, 'pascal_voc_eval')
+        # Also creates missing parents and tolerates an existing directory.
+        os.makedirs(dirname, exist_ok=True)
+        res_file_template = os.path.join(dirname, "{}.txt")
+        aps = defaultdict(list)  # iou -> ap per class
+        recs = defaultdict(list)  # iou -> final recall (%) per class
+        precs = defaultdict(list)  # iou -> final precision (%) per class
+        all_recs = defaultdict(list)  # iou -> full recall curve per class
+        unk_det_as_knowns = defaultdict(list)
+        tp_plus_fp_cs = defaultdict(list)
+        fp_os = defaultdict(list)
+        for cls_id, cls_name in enumerate(self._class_names):
+            lines = predictions.get(cls_id, [""])
+            with open(res_file_template.format(cls_name), "w") as f:
+                f.write("\n".join(lines))
+            for thresh in [50, ]:
+                (rec, prec, ap, unk_det_as_known, _num_unk,
+                 tp_plus_fp_closed_set, fp_open_set) = voc_eval(
+                    res_file_template,
+                    self._anno_file_template,
+                    self._image_set_path,
+                    cls_name,
+                    ovthresh=thresh / 100.0,
+                    use_07_metric=self._is_2007,
+                    known_classes=self.known_classes
+                )
+                aps[thresh].append(ap * 100)
+                unk_det_as_knowns[thresh].append(unk_det_as_known)
+                all_recs[thresh].append(rec)
+                tp_plus_fp_cs[thresh].append(tp_plus_fp_closed_set)
+                fp_os[thresh].append(fp_open_set)
+                # The curves are empty when the class has no detections.
+                has_dets = len(rec) > 0 and len(prec) > 0
+                recs[thresh].append(rec[-1] * 100 if has_dets else 0)
+                precs[thresh].append(prec[-1] * 100 if has_dets else 0)
+        results_2d = {}
+        mAP = {iou: np.mean(x) for iou, x in aps.items()}
+        results_2d['mAP'] = mAP[50]
+        wi = self.compute_WI_at_many_recall_level(
+            all_recs, tp_plus_fp_cs, fp_os)
+        results_2d['WI'] = wi[0.8][50] * 100
+        total_num_unk_det_as_known = {iou: np.sum(
+            x) for iou, x in unk_det_as_knowns.items()}
+        results_2d['AOSE'] = total_num_unk_det_as_known[50]
+        # Known
+        results_2d.update({
+            "AP@K": np.mean(aps[50][:self.num_known_classes]),
+            "P@K": np.mean(precs[50][:self.num_known_classes]),
+            "R@K": np.mean(recs[50][:self.num_known_classes]),
+        })
+        # Unknown (the last class name)
+        results_2d.update({
+            "AP@U": np.mean(aps[50][-1]),
+            "P@U": np.mean(precs[50][-1]),
+            "R@U": np.mean(recs[50][-1]),
+        })
+        results_head = list(results_2d.keys())
+        results_data = [[float(results_2d[k]) for k in results_2d]]
+        table = tabulate(
+            results_data,
+            tablefmt="pipe",
+            floatfmt=".2f",
+            headers=results_head,
+            numalign="left",
+        )
+        self.logger.info("\n" + table)
+        return {",".join(results_head): ",".join([str(round(x,2)) for x in results_data[0]])}
+@lru_cache(maxsize=None)
+def _parse_rec_cached(filename, known_classes):
+    """Parse one annotation file into an immutable tuple of object records.
+    Cached on ``(filename, known_classes)`` so that each XML file is read once
+    rather than once per evaluated class.
+    """
+    with PathManager.open(filename) as f:
+        tree = ET.parse(f)
+    objects = []
+    for obj in tree.findall("object"):
+        cls_name = obj.find("name").text
+        # translate unseen classes to unknown
+        if cls_name not in known_classes:
+            cls_name = 'unknown'
+        difficult = int(obj.find("difficult").text)
+        bbox = obj.find("bndbox")
+        coords = tuple(
+            int(bbox.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")
+        )
+        objects.append({"name": cls_name, "difficult": difficult, "bbox": coords})
+    return tuple(objects)
+def parse_rec(filename, known_classes):
+    """Parse a PASCAL VOC xml file.
+    Args:
+        filename (str): path of the annotation file (opened with PathManager).
+        known_classes (iterable of str): class names to keep; every object with
+            another name is relabelled 'unknown'.
+    Returns:
+        list[dict]: one dict per object with keys "name" (str),
+        "difficult" (int) and "bbox" ([xmin, ymin, xmax, ymax] as ints). The
+        list and its dicts are fresh on every call, so callers may mutate them.
+    """
+    key = known_classes if known_classes is None else tuple(known_classes)
+    return [dict(obj, bbox=list(obj["bbox"])) for obj in _parse_rec_cached(filename, key)]
+def compute_overlaps(BBGT, bb):
+    """Return the IoU of one box against each row of ``BBGT``.
+    Boxes are [xmin, ymin, xmax, ymax]; widths and heights are computed with
+    the "+ 1" pixel convention.
+    Args:
+        BBGT: 2-D array with one box per row.
+        bb: a single box.
+    Returns:
+        1-D array with one IoU value per row of ``BBGT``.
+    """
+    # Corners of the intersection rectangle.
+    left = np.maximum(BBGT[:, 0], bb[0])
+    top = np.maximum(BBGT[:, 1], bb[1])
+    right = np.minimum(BBGT[:, 2], bb[2])
+    bottom = np.minimum(BBGT[:, 3], bb[3])
+    # Clamp at zero so that disjoint boxes contribute no area.
+    inter_w = np.maximum(right - left + 1.0, 0.0)
+    inter_h = np.maximum(bottom - top + 1.0, 0.0)
+    intersection = inter_w * inter_h
+    box_area = (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
+    gt_area = (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
+    union = box_area + gt_area - intersection
+    return intersection / union
+def _collect_class_gt(recs, imagenames, classname):
+    """Index the ground-truth objects named ``classname`` by image.
+    Returns:
+        tuple: (dict image name -> {"bbox", "difficult", "det"}, number of
+        non-difficult objects over all images).
+    """
+    class_recs = {}
+    npos = 0
+    for imagename in imagenames:
+        R = [obj for obj in recs[imagename] if obj["name"] == classname]
+        bbox = np.array([x["bbox"] for x in R])
+        # The `np.bool` alias no longer exists in NumPy; builtin `bool` is the same dtype.
+        difficult = np.array([x["difficult"] for x in R]).astype(bool)
+        npos = npos + sum(~difficult)
+        class_recs[imagename] = {
+            "bbox": bbox, "difficult": difficult, "det": [False] * len(R)}
+    return class_recs, npos
+def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False, known_classes=None):
+    """Evaluate the detections of one class in VOC style, with open-set statistics.
+    Args:
+        detpath (str): template of the detection file; ``format(classname)`` is applied.
+        annopath (str): template of the annotation file; ``format(imagename)`` is applied.
+        imagesetfile (str): text file with one image name per line.
+        classname (str): class to evaluate.
+        ovthresh (float): a detection matches when its IoU is above this value.
+        use_07_metric (bool): forwarded to ``voc_ap``.
+        known_classes (iterable of str): forwarded to ``parse_rec``; objects of
+            any other class count as 'unknown'. Must not be None when the
+            image set is non-empty.
+    Returns:
+        tuple: ``(rec, prec, ap, unk_det_as_known, n_unk, tp_plus_fp, fp_open_set)``.
+        ``rec``/``prec`` are cumulative curves over detections sorted by
+        confidence, ``unk_det_as_known`` counts detections overlapping an
+        unknown object, ``n_unk`` is the number of non-difficult unknown
+        objects, and the last two are cumulative arrays. For
+        ``classname == 'unknown'`` the tuple is
+        ``(rec, prec, ap, 0, n_unk, None, None)``.
+    """
+    # first load gt
+    # read list of images
+    with PathManager.open(imagesetfile, "r") as f:
+        lines = f.readlines()
+    imagenames = [x.strip() for x in lines]
+    # load annots
+    recs = {}
+    for imagename in imagenames:
+        recs[imagename] = parse_rec(
+            annopath.format(imagename), tuple(known_classes))
+    # extract gt objects for this class
+    class_recs, npos = _collect_class_gt(recs, imagenames, classname)
+    # read dets
+    detfile = detpath.format(classname)
+    with open(detfile, "r") as f:
+        lines = f.readlines()
+    splitlines = [x.strip().split(" ") for x in lines]
+    image_ids = [x[0] for x in splitlines]
+    confidence = np.array([float(x[1]) for x in splitlines])
+    BB = np.array([[float(z) for z in x[2:]]
+                   for x in splitlines]).reshape(-1, 4)
+    # sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+    # go down dets and mark TPs and FPs
+    nd = len(image_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+    for d in range(nd):
+        R = class_recs[image_ids[d]]
+        bb = BB[d, :].astype(float)
+        ovmax = -np.inf
+        BBGT = R["bbox"].astype(float)
+        if BBGT.size > 0:
+            overlaps = compute_overlaps(BBGT, bb)
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+        if ovmax > ovthresh:
+            # A match with a difficult object is neither a TP nor a FP.
+            if not R["difficult"][jmax]:
+                if not R["det"][jmax]:
+                    tp[d] = 1.0
+                    R["det"][jmax] = 1
+                else:
+                    # The object was already claimed by a higher-scoring detection.
+                    fp[d] = 1.0
+        else:
+            fp[d] = 1.0
+    # compute precision recall
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    rec = tp / float(npos)
+    # avoid divide by zero in case the first detection matches a difficult
+    # ground truth
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+    # compute unknown det as known
+    unknown_class_recs, n_unk = _collect_class_gt(recs, imagenames, 'unknown')
+    if classname == 'unknown':
+        return rec, prec, ap, 0, n_unk, None, None
+    # Go down each detection and see if it has an overlap with an unknown object.
+    # If so, it is an unknown object that was classified as known.
+    is_unk = np.zeros(nd)
+    for d in range(nd):
+        BBGT = unknown_class_recs[image_ids[d]]["bbox"].astype(float)
+        if BBGT.size > 0:
+            overlaps = compute_overlaps(BBGT, BB[d, :].astype(float))
+            if np.max(overlaps) > ovthresh:
+                is_unk[d] = 1.0
+    is_unk_sum = np.sum(is_unk)
+    tp_plus_fp_closed_set = tp+fp
+    fp_open_set = np.cumsum(is_unk)
+    return rec, prec, ap, is_unk_sum, n_unk, tp_plus_fp_closed_set, fp_open_set
diff --git a/opendet2/modeling/__init__.py b/opendet2/modeling/__init__.py
new file mode 100644
index 0000000..7fa01a2
--- /dev/null
+++ b/opendet2/modeling/__init__.py
@@ -0,0 +1,5 @@
+from .meta_arch import OpenSetRetinaNet
+from .backbone import *
+from .roi_heads import *
+# Export only the public (non-underscore) names. Listing every global would
+# also export module dunders such as __name__ through `from ... import *`.
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/opendet2/modeling/backbone/__init__.py b/opendet2/modeling/backbone/__init__.py
new file mode 100644
index 0000000..f9cf81c
--- /dev/null
+++ b/opendet2/modeling/backbone/__init__.py
@@ -0,0 +1,3 @@
+from .swin_transformer import SwinTransformer
+# Export every name currently in this module's globals that does not start
+# with an underscore.
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
\ No newline at end of file
diff --git a/opendet2/modeling/backbone/swin_transformer.py b/opendet2/modeling/backbone/swin_transformer.py
new file mode 100644
index 0000000..d609389
--- /dev/null
+++ b/opendet2/modeling/backbone/swin_transformer.py
@@ -0,0 +1,726 @@
+# --------------------------------------------------------
+# Swin Transformer
+# modified from https://github.com/xiaohu2015/SwinT_detectron2/blob/main/swint/swin_transformer.py
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+import numpy as np
+from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+from detectron2.modeling.backbone import Backbone
+from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
+from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool, LastLevelP6P7
+from detectron2.layers import ShapeSpec
+class Mlp(nn.Module):
+    """Feed-forward block: Linear -> activation -> dropout -> Linear -> dropout.
+    Args:
+        in_features (int): Size of the last input dimension.
+        hidden_features (int, optional): Hidden width; ``in_features`` is used when falsy.
+        out_features (int, optional): Output width; ``in_features`` is used when falsy.
+        act_layer (Callable): Zero-argument factory for the activation module. Default: nn.GELU
+        drop (float): Dropout probability, shared by both dropout applications. Default: 0.0
+    """
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        hidden_width = hidden_features or in_features
+        output_width = out_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_width)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_width, output_width)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        """Apply both linear layers; the same dropout module follows each of them."""
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+def window_partition(x, window_size):
+    """Cut a feature map into non-overlapping square windows.
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    batch, height, width, channels = x.shape
+    rows, cols = height // window_size, width // window_size
+    tiled = x.view(batch, rows, window_size, cols, window_size, channels)
+    # bring the two window-grid axes next to each other before flattening them into the batch axis
+    return tiled.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, channels)
+def window_reverse(windows, window_size, H, W):
+    """Stitch windows back into a feature map (inverse of ``window_partition``).
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    windows_per_image = H * W / window_size / window_size
+    batch = int(windows.shape[0] / windows_per_image)
+    grid = windows.view(batch, H // window_size, W // window_size, window_size, window_size, -1)
+    return grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch, H, W, -1)
+class WindowAttention(nn.Module):
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
+ It supports both of shifted and non-shifted window.
+ Args:
+ dim (int): Number of input channels.
+ window_size (tuple[int]): The height and width of the window.
+ num_heads (int): Number of attention heads.
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+ """
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.dim = dim
+ self.window_size = window_size # Wh, Ww
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ # any falsy qk_scale (None or 0) falls back to the default head_dim ** -0.5
+ self.scale = qk_scale or head_dim ** -0.5
+ # define a parameter table of relative position bias
+ self.relative_position_bias_table = nn.Parameter(
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
+ # get pair-wise relative position index for each token inside the window
+ coords_h = torch.arange(self.window_size[0])
+ coords_w = torch.arange(self.window_size[1])
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
+ relative_coords[:, :, 1] += self.window_size[1] - 1
+ # scale the row offset by the number of distinct column offsets so that the
+ # sum below maps every (row, col) offset pair to a unique row of the bias table
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
+ # registered as a buffer: saved with the module, but not a learnable parameter
+ self.register_buffer("relative_position_index", relative_position_index)
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+ trunc_normal_(self.relative_position_bias_table, std=.02)
+ self.softmax = nn.Softmax(dim=-1)
+ def forward(self, x, mask=None):
+ """ Forward function.
+ Args:
+ x: input features with shape of (num_windows*B, N, C)
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+ Returns:
+ Tensor of shape (num_windows*B, N, C), after the output projection and dropout.
+ """
+ B_, N, C = x.shape
+ # one linear layer produces q, k and v; after the permute: (3, B_, num_heads, N, C // num_heads)
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
+ q = q * self.scale
+ attn = (q @ k.transpose(-2, -1))
+ # look up the learned bias for every (query, key) position pair in the window
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
+ attn = attn + relative_position_bias.unsqueeze(0)
+ if mask is not None:
+ nW = mask.shape[0]
+ # split the leading axis into (B_ // nW, nW) so the per-window mask broadcasts over it and over heads
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+ attn = attn.view(-1, self.num_heads, N, N)
+ attn = self.softmax(attn)
+ else:
+ attn = self.softmax(attn)
+ attn = self.attn_drop(attn)
+ # merge the heads back into the channel dimension
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+class SwinTransformerBlock(nn.Module):
+ """ Swin Transformer Block.
+ Args:
+ dim (int): Number of input channels.
+ num_heads (int): Number of attention heads.
+ window_size (int): Window size.
+ shift_size (int): Shift size for SW-MSA.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+ drop (float, optional): Dropout rate. Default: 0.0
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+ """
+ def __init__(self, dim, num_heads, window_size=7, shift_size=0,
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.dim = dim
+ self.num_heads = num_heads
+ self.window_size = window_size
+ self.shift_size = shift_size
+ self.mlp_ratio = mlp_ratio
+ assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+ self.norm1 = norm_layer(dim)
+ self.attn = WindowAttention(
+ dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
+ qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+ # stochastic depth is only instantiated for a positive rate; otherwise a no-op
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+ # spatial size of the input; must be assigned by the caller before forward()
+ # (BasicLayer.forward sets blk.H / blk.W on every block it runs)
+ self.H = None
+ self.W = None
+ def forward(self, x, mask_matrix):
+ """ Forward function.
+ Args:
+ x: Input feature, tensor size (B, H*W, C). H and W are not arguments:
+ they are read from self.H and self.W.
+ mask_matrix: Attention mask for cyclic shift; only used when shift_size > 0.
+ Returns:
+ Tensor of size (B, H*W, C).
+ """
+ B, L, C = x.shape
+ H, W = self.H, self.W
+ assert L == H * W, "input feature has wrong size"
+ shortcut = x
+ x = self.norm1(x)
+ x = x.view(B, H, W, C)
+ # pad feature maps to multiples of window size
+ pad_l = pad_t = 0
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
+ # F.pad lists pads from the last dim backwards: C (none), then W (left, right), then H (top, bottom)
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+ _, Hp, Wp, _ = x.shape
+ # cyclic shift
+ if self.shift_size > 0:
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+ attn_mask = mask_matrix
+ else:
+ shifted_x = x
+ attn_mask = None
+ # partition windows
+ x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
+ attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
+ # merge windows
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
+ # reverse cyclic shift
+ if self.shift_size > 0:
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+ else:
+ x = shifted_x
+ # crop away the right/bottom padding added above
+ if pad_r > 0 or pad_b > 0:
+ x = x[:, :H, :W, :].contiguous()
+ x = x.view(B, H * W, C)
+ # FFN
+ # residual around attention, then residual around the MLP
+ x = shortcut + self.drop_path(x)
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+class PatchMerging(nn.Module):
+    """ Patch Merging Layer: halves the spatial resolution and doubles the channels.
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+    """
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        Returns:
+            Tensor of size (B, ceil(H/2)*ceil(W/2), 2*C).
+        """
+        batch, num_tokens, channels = x.shape
+        assert num_tokens == H * W, "input feature has wrong size"
+        grid = x.view(batch, H, W, channels)
+        # an odd side gets one extra row/column of zeros so 2x2 groups tile the map exactly
+        if H % 2 == 1 or W % 2 == 1:
+            grid = F.pad(grid, (0, 0, 0, W % 2, 0, H % 2))
+        # (row offset, col offset) of the four members of each 2x2 group, in concatenation order
+        offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
+        merged = torch.cat([grid[:, r::2, c::2, :] for r, c in offsets], -1)  # B H/2 W/2 4*C
+        merged = merged.view(batch, -1, 4 * channels)  # B H/2*W/2 4*C
+        return self.reduction(self.norm(merged))
+class BasicLayer(nn.Module):
+    """ A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (int): Local window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate, either one
+            value for all blocks or one value per block. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 use_checkpoint=False):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        # The docstring allows a per-block sequence; accept tuples as well as lists so a
+        # tuple is indexed per block instead of being handed to the block as a "rate".
+        per_block_rates = isinstance(drop_path, (list, tuple))
+        # build blocks: even-indexed blocks use regular windows, odd-indexed ones shifted windows
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i] if per_block_rates else drop_path,
+                norm_layer=norm_layer)
+            for i in range(depth)])
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        Returns:
+            tuple ``(x, H, W, x_down, Wh, Ww)``: the stage output with its resolution, followed by
+            the downsampled tensor with its resolution. Without a downsample layer the last three
+            entries repeat the first three.
+        """
+        # calculate attention mask for SW-MSA on the padded (window-aligned) resolution
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        # give each of the 3x3 regions its own id; tokens with different ids must not attend to each other
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        # pairs from different regions get -100 (suppressed by the softmax), same-region pairs get 0
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+        for blk in self.blocks:
+            # the block reads its input resolution from these attributes
+            blk.H, blk.W = H, W
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x, attn_mask)
+            else:
+                x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding, implemented as a convolution whose kernel and stride equal the patch size.
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        self.patch_size = to_2tuple(patch_size)
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)
+        self.norm = None if norm_layer is None else norm_layer(embed_dim)
+    def forward(self, x):
+        """Embed a (B, C, H, W) image batch; H and W are zero-padded up to a multiple of the patch size."""
+        patch_h, patch_w = self.patch_size
+        _, _, height, width = x.size()
+        # pad on the right, then on the bottom, so the convolution covers the whole image
+        if width % patch_w != 0:
+            x = F.pad(x, (0, patch_w - width % patch_w))
+        if height % patch_h != 0:
+            x = F.pad(x, (0, 0, 0, patch_h - height % patch_h))
+        x = self.proj(x)  # B C Wh Ww
+        if self.norm is None:
+            return x
+        # the norm acts on the channel axis, so flatten to (B, Wh*Ww, C) and restore the map afterwards
+        out_h, out_w = x.size(2), x.size(3)
+        tokens = self.norm(x.flatten(2).transpose(1, 2))
+        return tokens.transpose(1, 2).view(-1, self.embed_dim, out_h, out_w)
+class SwinTransformer(Backbone):
+    """ Swin Transformer backbone.
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
+          https://arxiv.org/pdf/2103.14030
+    Stage ``i`` (0-based) is exposed under the name ``"stage{i + 2}"`` with stride ``4 * 2 ** i``.
+    Args:
+        pretrain_img_size (int): Input image size for training the pretrained model,
+            used in absolute postion embedding. Default 224.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (Sequence[int]): Depths of each Swin Transformer stage.
+        num_heads (Sequence[int]): Number of attention head of each stage.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+        out_features (Sequence[str] | None): Names of the stages to output. ``None`` selects
+            every stage. Default: None.
+    """
+    def __init__(self,
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=(2, 2, 6, 2),
+                 num_heads=(3, 6, 12, 24),
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 ape=False,
+                 patch_norm=True,
+                 frozen_stages=-1,
+                 use_checkpoint=False,
+                 out_features=None):
+        super(SwinTransformer, self).__init__()
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.frozen_stages = frozen_stages
+        # The default None used to fail on the membership tests below; treat it as "all stages".
+        if out_features is None:
+            out_features = [f'stage{i + 2}' for i in range(self.num_layers)]
+        self.out_features = out_features
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
+            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
+            trunc_normal_(self.absolute_pos_embed, std=.02)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        # stochastic depth
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+        self._out_feature_strides = {}
+        self._out_feature_channels = {}
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2 ** i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                # this stage's slice of the global per-block drop-path schedule
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                # every stage but the last ends with a patch-merging downsample
+                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
+                use_checkpoint=use_checkpoint)
+            self.layers.append(layer)
+            stage = f'stage{i_layer+2}'
+            if stage in self.out_features:
+                self._out_feature_channels[stage] = embed_dim * 2 ** i_layer
+                self._out_feature_strides[stage] = 4 * 2 ** i_layer
+        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
+        self.num_features = num_features
+        # add a norm layer for each output
+        for i_layer in range(self.num_layers):
+            stage = f'stage{i_layer+2}'
+            if stage in self.out_features:
+                layer = norm_layer(num_features[i_layer])
+                layer_name = f'norm{i_layer}'
+                self.add_module(layer_name, layer)
+        self._freeze_stages()
+    def _freeze_stages(self):
+        """Put frozen parts in eval mode and stop their gradients, according to ``frozen_stages``."""
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.requires_grad = False
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            # frozen_stages == k (k >= 2) freezes the first k - 1 transformer stages
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+    def init_weights(self, pretrained=None):
+        """Initialize the weights in backbone.
+        Args:
+            pretrained (str, optional): Path to pre-trained weights.
+                Defaults to None. Currently not read by this method.
+        """
+        def _init_weights(m):
+            if isinstance(m, nn.Linear):
+                trunc_normal_(m.weight, std=.02)
+                if isinstance(m, nn.Linear) and m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.LayerNorm):
+                nn.init.constant_(m.bias, 0)
+                nn.init.constant_(m.weight, 1.0)
+        self.apply(_init_weights)
+    def forward(self, x):
+        """Forward function.
+        Args:
+            x: image batch handed to the patch embedding.
+        Returns:
+            dict[str, Tensor]: one (B, C, H, W) feature map per name in ``out_features``.
+        """
+        x = self.patch_embed(x)
+        Wh, Ww = x.size(2), x.size(3)
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose(1, 2)
+        x = self.pos_drop(x)
+        outs = {}
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            # x_out/H/W: this stage's output; x/Wh/Ww: the (downsampled) input of the next stage
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+            name = f'stage{i+2}'
+            if name in self.out_features:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
+                outs[name] = out
+        return outs
+    def train(self, mode=True):
+        """Convert the model into training mode while keep layers freezed.
+        Returns:
+            SwinTransformer: ``self``, as ``nn.Module.train`` does, so that
+            ``model.train()`` / ``model.eval()`` can be chained or assigned.
+        """
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
+        return self
+    def output_shape(self):
+        """Return the channels and stride of every output feature as a ``{name: ShapeSpec}`` dict."""
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self.out_features
+        }
+def build_swint_backbone(cfg, input_shape):
+    """
+    Create a SwinTransformer instance from config.
+    Args:
+        cfg: config node; ``MODEL.SWINT.*`` and ``MODEL.BACKBONE.FREEZE_AT`` are read.
+        input_shape: object whose ``channels`` attribute gives the number of input channels.
+    Returns:
+        SwinTransformer: a :class:`SwinTransformer` instance.
+    """
+    swint_cfg = cfg.MODEL.SWINT
+    return SwinTransformer(
+        patch_size=4,
+        in_chans=input_shape.channels,
+        embed_dim=swint_cfg.EMBED_DIM,
+        depths=swint_cfg.DEPTHS,
+        num_heads=swint_cfg.NUM_HEADS,
+        window_size=swint_cfg.WINDOW_SIZE,
+        mlp_ratio=swint_cfg.MLP_RATIO,
+        qkv_bias=True,
+        qk_scale=None,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=swint_cfg.DROP_PATH_RATE,
+        norm_layer=nn.LayerNorm,
+        ape=swint_cfg.APE,
+        patch_norm=True,
+        frozen_stages=cfg.MODEL.BACKBONE.FREEZE_AT,
+        out_features=swint_cfg.OUT_FEATURES,
+    )
+def build_swint_fpn_backbone(cfg, input_shape: ShapeSpec):
+    """
+    Wrap the Swin Transformer backbone in an FPN topped by a ``LastLevelMaxPool`` block.
+    Args:
+        cfg: a detectron2 CfgNode
+        input_shape (ShapeSpec): forwarded to ``build_swint_backbone``.
+    Returns:
+        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+    """
+    fpn_cfg = cfg.MODEL.FPN
+    return FPN(
+        bottom_up=build_swint_backbone(cfg, input_shape),
+        in_features=fpn_cfg.IN_FEATURES,
+        out_channels=fpn_cfg.OUT_CHANNELS,
+        norm=fpn_cfg.NORM,
+        top_block=LastLevelMaxPool(),
+        fuse_type=fpn_cfg.FUSE_TYPE,
+    )
+class LastLevelP6(nn.Module):
+    """
+    This module is used in FCOS to generate extra layers
+    It produces a single extra level (P6) with a 3x3, stride-2 convolution.
+    Args:
+        in_channels (int): Channels of the input feature map.
+        out_channels (int): Channels of the produced P6 map.
+        in_features (str): Name of the feature this block consumes; stored as ``in_feature``.
+            Default: "res5".
+    """
+    def __init__(self, in_channels, out_channels, in_features="res5"):
+        super().__init__()
+        self.num_levels = 1
+        self.in_feature = in_features
+        self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
+        # ``weight_init`` is not imported in this module, so the former
+        # ``weight_init.c2_xavier_fill`` call raised NameError. Apply the same
+        # initialisation directly: Kaiming-uniform weights (a=1) and zero bias.
+        nn.init.kaiming_uniform_(self.p6.weight, a=1)
+        if self.p6.bias is not None:
+            nn.init.constant_(self.p6.bias, 0)
+    def forward(self, x):
+        """Return ``[p6]``, a one-element list holding the downsampled feature map."""
+        p6 = self.p6(x)
+        return [p6]
+def build_retinanet_swint_fpn_backbone(cfg, input_shape: ShapeSpec):
+    """
+    Build a Swin Transformer + FPN backbone with 0, 1 or 2 extra top levels.
+    Args:
+        cfg: a detectron2 CfgNode
+        input_shape (ShapeSpec): forwarded to ``build_swint_backbone``.
+    Returns:
+        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+    Raises:
+        ValueError: if ``cfg.MODEL.FPN.TOP_LEVELS`` is not 0, 1 or 2.
+    """
+    bottom_up = build_swint_backbone(cfg, input_shape)
+    in_features = cfg.MODEL.FPN.IN_FEATURES
+    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
+    top_levels = cfg.MODEL.FPN.TOP_LEVELS
+    # the top block is given "p5" as its input feature, so its input width is the FPN output width
+    in_channels_top = out_channels
+    # One exclusive chain: an unsupported value now fails with a clear message instead of
+    # leaving ``top_block`` unbound (UnboundLocalError at the FPN call below).
+    if top_levels == 2:
+        top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
+    elif top_levels == 1:
+        top_block = LastLevelP6(in_channels_top, out_channels, "p5")
+    elif top_levels == 0:
+        top_block = None
+    else:
+        raise ValueError(
+            "cfg.MODEL.FPN.TOP_LEVELS must be 0, 1 or 2, got {!r}".format(top_levels))
+    backbone = FPN(
+        bottom_up=bottom_up,
+        in_features=in_features,
+        out_channels=out_channels,
+        norm=cfg.MODEL.FPN.NORM,
+        top_block=top_block,
+        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
+    )
+    return backbone
diff --git a/opendet2/modeling/layers/__init__.py b/opendet2/modeling/layers/__init__.py
new file mode 100644
index 0000000..491a0d2
--- /dev/null
+++ b/opendet2/modeling/layers/__init__.py
@@ -0,0 +1,3 @@
+from .mlp import *
+# Public API of this package: every non-underscore name bound above.
+__all__ = [name for name in globals() if not name.startswith("_")]
diff --git a/opendet2/modeling/layers/mlp.py b/opendet2/modeling/layers/mlp.py
new file mode 100644
index 0000000..aa714d0
--- /dev/null
+++ b/opendet2/modeling/layers/mlp.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import fvcore.nn.weight_init as weight_init
+class MLP(nn.Module):
+    """Linear -> ReLU -> Linear head whose output is L2-normalised along dim 1.
+    Args:
+        in_dim (int): Input feature size.
+        out_dim (int): Output feature size.
+        hidden_dim (int, optional): Hidden width; ``in_dim`` is used when falsy.
+    """
+    def __init__(self, in_dim, out_dim, hidden_dim=None):
+        super().__init__()
+        width = hidden_dim if hidden_dim else in_dim
+        layers = [
+            nn.Linear(in_dim, width),
+            nn.ReLU(inplace=True),
+            nn.Linear(width, out_dim),
+        ]
+        self.head = nn.Sequential(*layers)
+        # only the linear layers carry weights to initialise
+        for module in layers:
+            if isinstance(module, nn.Linear):
+                weight_init.c2_xavier_fill(module)
+    def forward(self, x):
+        """Return the head output scaled to unit L2 norm over dim 1."""
+        return F.normalize(self.head(x), dim=1)
+class ConvMLP(nn.Module):
+    """Conv3x3 -> ReLU -> Conv3x3 head whose output is L2-normalised along the channel dim.
+    Args:
+        in_dim (int): Input channels.
+        out_dim (int): Output channels.
+        hidden_dim (int, optional): Hidden channels; ``in_dim`` is used when falsy.
+    """
+    def __init__(self, in_dim, out_dim, hidden_dim=None):
+        super().__init__()
+        width = hidden_dim if hidden_dim else in_dim
+        layers = [
+            nn.Conv2d(in_dim, width, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(width, out_dim, kernel_size=3, stride=1, padding=1),
+        ]
+        self.head = nn.Sequential(*layers)
+        # Initialization: small Gaussian weights, zero bias
+        for module in layers:
+            if isinstance(module, nn.Conv2d):
+                torch.nn.init.normal_(module.weight, mean=0, std=0.01)
+                torch.nn.init.constant_(module.bias, 0)
+    def forward(self, x):
+        """Return the head output scaled to unit L2 norm over dim 1 (channels)."""
+        return F.normalize(self.head(x), dim=1)
\ No newline at end of file
diff --git a/opendet2/modeling/losses/__init__.py b/opendet2/modeling/losses/__init__.py
new file mode 100644
index 0000000..a24abdf
--- /dev/null
+++ b/opendet2/modeling/losses/__init__.py
@@ -0,0 +1,4 @@
+from .unknown_probability_loss import UPLoss
+from .instance_contrastive_loss import ICLoss
+# Public API of this package: every non-underscore name bound above.
+__all__ = [name for name in globals() if not name.startswith("_")]
\ No newline at end of file
diff --git a/opendet2/modeling/losses/instance_contrastive_loss.py b/opendet2/modeling/losses/instance_contrastive_loss.py
new file mode 100644
index 0000000..bad5320
--- /dev/null
+++ b/opendet2/modeling/losses/instance_contrastive_loss.py
@@ -0,0 +1,40 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class ICLoss(nn.Module):
+    """ Instance Contrastive Loss
+    Args:
+        tau (float): Temperature dividing the feature similarities. Default: 0.1
+    """
+    def __init__(self, tau=0.1):
+        super().__init__()
+        self.tau = tau
+    def forward(self, features, labels, queue_features, queue_labels):
+        """Contrast every row of ``features`` against every row of ``queue_features``.
+        Queue entries sharing the anchor's label count as positives. Returns a scalar
+        tensor, replaced by 0 when the loss evaluates to NaN.
+        """
+        # positives: anchor i and queue entry j carry the same label
+        same_label = (labels[:, None] == queue_labels[:, None].T).float().to(features.device)
+        similarity = torch.matmul(features, queue_features.T) / self.tau
+        # subtract the row maximum for numerical stability
+        row_max = similarity.max(dim=1, keepdim=True).values
+        logits = similarity - row_max.detach()
+        # entries whose shifted logit is exactly 0 are masked out ("mask itself")
+        keep = torch.ones_like(logits)
+        keep[logits == 0] = 0
+        positives = same_label * keep
+        # log-probability of each kept entry under a softmax over the kept entries
+        denominator = (torch.exp(logits) * keep).sum(1, keepdim=True)
+        log_prob = logits - torch.log(denominator)
+        # mean log-likelihood over the positives of every anchor
+        per_anchor = (positives * log_prob).sum(1) / positives.sum(1)
+        loss = -per_anchor.mean()
+        # trick: avoid loss nan
+        if torch.isnan(loss):
+            return features.new_tensor(0.0)
+        return loss
diff --git a/opendet2/modeling/losses/unknown_probability_loss.py b/opendet2/modeling/losses/unknown_probability_loss.py
new file mode 100644
index 0000000..c5adbae
--- /dev/null
+++ b/opendet2/modeling/losses/unknown_probability_loss.py
@@ -0,0 +1,93 @@
+import torch
+import torch.distributions as dists
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+class UPLoss(nn.Module):
+    """Unknown Probability Loss
+    Scores are expected to have ``num_classes + 1`` columns; the label ``num_classes``
+    marks background and column ``num_classes - 1`` is the unknown class.
+    Args:
+        num_classes (int): Label value used for background rows.
+        sampling_metric (str): How rows are picked, one of "min_score", "max_entropy", "random".
+        topk (int): Rows sampled from foreground and from background; -1 uses the
+            number of foreground rows. Default: 3
+        alpha (float): Exponent applied to ``1 - p_gt`` in the soft target. Default: 1.0
+    """
+    def __init__(self,
+                 num_classes: int,
+                 sampling_metric: str = "min_score",
+                 topk: int = 3,
+                 alpha: float = 1.0):
+        super().__init__()
+        self.num_classes = num_classes
+        assert sampling_metric in ["min_score", "max_entropy", "random"]
+        self.sampling_metric = sampling_metric
+        # if topk==-1, sample len(fg)*2 examples
+        self.topk = topk
+        self.alpha = alpha
+    def _soft_cross_entropy(self, input: Tensor, target: Tensor):
+        """Cross entropy against soft targets, averaged over rows (zero for an empty batch)."""
+        if input.shape[0] == 0:
+            # no sampled rows: return a zero that stays attached to the graph instead of 0/0 = NaN
+            return input.sum() * 0.0
+        logprobs = F.log_softmax(input, dim=1)
+        return -(target * logprobs).sum() / input.shape[0]
+    def _sampling(self, scores: Tensor, labels: Tensor):
+        """Pick the most uncertain foreground and background rows.
+        Returns:
+            tuple: ``(fg_scores, bg_scores, fg_labels, bg_labels)`` for the selected rows.
+        """
+        fg_inds = labels != self.num_classes
+        fg_scores, fg_labels = scores[fg_inds], labels[fg_inds]
+        bg_scores, bg_labels = scores[~fg_inds], labels[~fg_inds]
+        # remove unknown classes
+        _fg_scores = torch.cat(
+            [fg_scores[:, :self.num_classes-1], fg_scores[:, -1:]], dim=1)
+        _bg_scores = torch.cat(
+            [bg_scores[:, :self.num_classes-1], bg_scores[:, -1:]], dim=1)
+        num_fg = fg_scores.size(0)
+        num_bg = bg_scores.size(0)
+        topk = num_fg if (self.topk == -1) or (num_fg < self.topk) else self.topk
+        # use maximum entropy as a metric for uncertainty
+        # we select topk proposals with maximum entropy
+        if self.sampling_metric == "max_entropy":
+            pos_metric = dists.Categorical(
+                _fg_scores.softmax(dim=1)).entropy()
+            neg_metric = dists.Categorical(
+                _bg_scores.softmax(dim=1)).entropy()
+        # use minimum score as a metric for uncertainty
+        # we select topk proposals with minimum max-score
+        elif self.sampling_metric == "min_score":
+            pos_metric = -_fg_scores.max(dim=1)[0]
+            neg_metric = -_bg_scores.max(dim=1)[0]
+        # we randomly select topk proposals
+        elif self.sampling_metric == "random":
+            pos_metric = torch.rand(_fg_scores.size(0),).to(scores.device)
+            neg_metric = torch.rand(_bg_scores.size(0),).to(scores.device)
+        _, pos_inds = pos_metric.topk(topk)
+        # there may be fewer background rows than topk; asking topk() for more than exist raises
+        _, neg_inds = neg_metric.topk(min(topk, num_bg))
+        fg_scores, fg_labels = fg_scores[pos_inds], fg_labels[pos_inds]
+        bg_scores, bg_labels = bg_scores[neg_inds], bg_labels[neg_inds]
+        return fg_scores, bg_scores, fg_labels, bg_labels
+    def forward(self, scores: Tensor, labels: Tensor):
+        """Compute the loss for ``scores`` (rows x (num_classes + 1)) and integer ``labels``.
+        Returns:
+            Tensor: scalar loss.
+        """
+        fg_scores, bg_scores, fg_labels, bg_labels = self._sampling(
+            scores, labels)
+        # sample both fg and bg
+        scores = torch.cat([fg_scores, bg_scores])
+        labels = torch.cat([fg_labels, bg_labels])
+        num_sample, num_classes = scores.shape
+        # column indices of every row with the ground-truth column removed
+        mask = torch.arange(num_classes).repeat(
+            num_sample, 1).to(scores.device)
+        inds = mask != labels[:, None].repeat(1, num_classes)
+        mask = mask[inds].reshape(num_sample, num_classes-1)
+        gt_scores = torch.gather(
+            F.softmax(scores, dim=1), 1, labels[:, None]).squeeze(1)
+        mask_scores = torch.gather(scores, 1, mask)
+        targets = torch.zeros_like(mask_scores)
+        num_fg = fg_scores.size(0)
+        # With the ground-truth column dropped, the unknown class sits at index num_classes-2
+        # for foreground rows and at num_classes-1 for background rows (whose dropped column is the last).
+        targets[:num_fg, self.num_classes-2] = gt_scores[:num_fg] * \
+            (1-gt_scores[:num_fg]).pow(self.alpha)
+        targets[num_fg:, self.num_classes-1] = gt_scores[num_fg:] * \
+            (1-gt_scores[num_fg:]).pow(self.alpha)
+        return self._soft_cross_entropy(mask_scores, targets.detach())
diff --git a/opendet2/modeling/meta_arch/__init__.py b/opendet2/modeling/meta_arch/__init__.py
new file mode 100644
index 0000000..8b336b1
--- /dev/null
+++ b/opendet2/modeling/meta_arch/__init__.py
@@ -0,0 +1,3 @@
+from .retinanet import OpenSetRetinaNet
+# Export only public names. A bare ``globals()`` dump would also list dunders
+# such as ``__name__`` and ``__file__``, which ``from ... import *`` would then
+# copy into (and overwrite in) the importing module.
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/opendet2/modeling/meta_arch/retinanet.py b/opendet2/modeling/meta_arch/retinanet.py
new file mode 100644
index 0000000..dc013bf
--- /dev/null
+++ b/opendet2/modeling/meta_arch/retinanet.py
@@ -0,0 +1,483 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from typing import Dict, List, Tuple
+import numpy as np
+import torch
+import torch.distributions as dists
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec, cat, cross_entropy
+from detectron2.modeling import META_ARCH_REGISTRY
+from detectron2.modeling.box_regression import _dense_box_regression_loss
+from detectron2.modeling.meta_arch.retinanet import RetinaNet, RetinaNetHead
+from detectron2.modeling.postprocessing import detector_postprocess
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+from fvcore.nn import sigmoid_focal_loss_jit
+from torch import Tensor, nn
+from torch.nn import functional as F
+from ..layers import ConvMLP
+from ..losses import ICLoss
+logger = logging.getLogger(__name__)
+def permute_to_N_HWA_K(tensor, K: int):
+ """
+ Transpose/reshape a tensor from (N, (Ai x K), H, W) to (N, (HxWxAi), K)
+ """
+ assert tensor.dim() == 4, tensor.shape
+ N, _, H, W = tensor.shape
+ tensor = tensor.view(N, -1, K, H, W)
+ tensor = tensor.permute(0, 3, 4, 1, 2)
+ tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K)
+ return tensor
+class UPLoss(nn.Module):
+ """Unknown Probability Loss for RetinaNet
+ """
+ def __init__(self,
+ num_classes: int,
+ sampling_metric: str = "min_score",
+ topk: int = 3,
+ alpha: float = 1.0):
+ super().__init__()
+ self.num_classes = num_classes
+ assert sampling_metric in ["min_score", "max_entropy", "random"]
+ self.sampling_metric = sampling_metric
+ # if topk==-1, sample len(fg)*2 examples
+ self.topk = topk
+ self.alpha = alpha
+ def _soft_cross_entropy(self, input: Tensor, target: Tensor):
+ logprobs = F.log_softmax(input, dim=1)
+ return -(target * logprobs).sum() / input.shape[0]
+ def _sampling(self, scores: Tensor, labels: Tensor):
+ fg_inds = labels != self.num_classes
+ fg_scores, fg_labels = scores[fg_inds], labels[fg_inds]
+ # remove unknown classes
+ _fg_scores = torch.cat(
+ [fg_scores[:, :self.num_classes-1], fg_scores[:, -1:]], dim=1)
+ num_fg = fg_scores.size(0)
+ topk = num_fg if (self.topk == -1) or (num_fg <
+ self.topk) else self.topk
+ # use maximum entropy as a metric for uncertainty
+ # we select topk proposals with maximum entropy
+ if self.sampling_metric == "max_entropy":
+ pos_metric = dists.Categorical(
+ _fg_scores.softmax(dim=1)).entropy()
+ # use minimum score as a metric for uncertainty
+ # we select topk proposals with minimum max-score
+ elif self.sampling_metric == "min_score":
+ pos_metric = -_fg_scores.max(dim=1)[0]
+ # we randomly select topk proposals
+ elif self.sampling_metric == "random":
+ pos_metric = torch.rand(_fg_scores.size(0),).to(scores.device)
+ _, pos_inds = pos_metric.topk(topk)
+ fg_scores, fg_labels = fg_scores[pos_inds], fg_labels[pos_inds]
+ return fg_scores, fg_labels
+ def forward(self, scores: Tensor, labels: Tensor):
+ scores, labels = self._sampling(scores, labels)
+ num_sample, num_classes = scores.shape
+ mask = torch.arange(num_classes).repeat(
+ num_sample, 1).to(scores.device)
+ inds = mask != labels[:, None].repeat(1, num_classes)
+ mask = mask[inds].reshape(num_sample, num_classes-1)
+ gt_scores = torch.gather(
+ F.softmax(scores, dim=1), 1, labels[:, None]).squeeze(1)
+ mask_scores = torch.gather(scores, 1, mask)
+ gt_scores[gt_scores < 0] = 0.0
+ targets = torch.zeros_like(mask_scores)
+ targets[:, self.num_classes-2] = gt_scores * \
+ (1-gt_scores).pow(self.alpha)
+ return self._soft_cross_entropy(mask_scores, targets.detach())
+class OpenSetRetinaNet(RetinaNet):
+ """
+ RetinaNet subclass that adds an unknown-probability loss (``UPLoss``) and an
+ instance contrastive loss (``ICLoss``) fed from a per-class feature queue.
+ """
+ @configurable
+ def __init__(
+ self,
+ num_known_classes,
+ max_iters,
+ up_loss_start_iter,
+ up_loss_sampling_metric,
+ up_loss_topk,
+ up_loss_alpha,
+ up_loss_weight,
+ ins_con_out_dim,
+ ins_con_queue_size,
+ ins_con_in_queue_size,
+ ins_con_batch_iou_thr,
+ ins_con_queue_iou_thr,
+ ins_con_queue_tau,
+ ins_con_loss_weight,
+ *args,
+ **kargs,
+ ):
+ """
+ Args:
+ num_known_classes: number of per-class queues that are allocated.
+ max_iters: iteration count used to decay the contrastive loss weight.
+ up_loss_start_iter: the UP loss is only computed after this iteration.
+ up_loss_sampling_metric, up_loss_topk, up_loss_alpha: forwarded to ``UPLoss``.
+ up_loss_weight: multiplier applied to the UP loss.
+ ins_con_out_dim: feature dimension stored in the queue.
+ ins_con_queue_size: number of slots per class in the queue.
+ ins_con_in_queue_size: max features enqueued per class and call.
+ ins_con_batch_iou_thr: IoU threshold for samples used in the loss.
+ ins_con_queue_iou_thr: IoU threshold for samples pushed to the queue.
+ ins_con_queue_tau: passed to ``ICLoss`` as ``tau``.
+ ins_con_loss_weight: multiplier applied to the contrastive loss.
+ *args, **kargs: forwarded unchanged to ``RetinaNet.__init__``.
+ """
+ super().__init__(*args, **kargs)
+ self.num_known_classes = num_known_classes
+ self.max_iters = max_iters
+ # self.num_classes is set by the parent constructor
+ self.up_loss = UPLoss(
+ self.num_classes,
+ sampling_metric=up_loss_sampling_metric,
+ topk=up_loss_topk,
+ alpha=up_loss_alpha
+ )
+ self.up_loss_start_iter = up_loss_start_iter
+ self.up_loss_weight = up_loss_weight
+ self.ins_con_loss = ICLoss(tau=ins_con_queue_tau)
+ self.ins_con_out_dim = ins_con_out_dim
+ self.ins_con_queue_size = ins_con_queue_size
+ self.ins_con_in_queue_size = ins_con_in_queue_size
+ self.ins_con_batch_iou_thr = ins_con_batch_iou_thr
+ self.ins_con_queue_iou_thr = ins_con_queue_iou_thr
+ self.ins_con_loss_weight = ins_con_loss_weight
+ # feature queue: (num_known_classes, queue_size, out_dim), zero-initialised
+ self.register_buffer('queue', torch.zeros(
+ self.num_known_classes, ins_con_queue_size, ins_con_out_dim))
+ # label of every queue slot; -1 marks a slot that has not been written yet
+ self.register_buffer('queue_label', torch.empty(
+ self.num_known_classes, ins_con_queue_size).fill_(-1).long())
+ # next write position of each class queue
+ self.register_buffer('queue_ptr', torch.zeros(
+ self.num_known_classes, dtype=torch.long))
+ @classmethod
+ def from_config(cls, cfg):
+ ret = super().from_config(cfg)
+ backbone_shape = ret["backbone"].output_shape()
+ feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES]
+ head = OpenSetRetinaNetHead(cfg, feature_shapes)
+ ret.update({
+ "head": head,
+ "num_known_classes": cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES,
+ "max_iters": cfg.SOLVER.MAX_ITER,
+ "up_loss_start_iter": cfg.UPLOSS.START_ITER,
+ "up_loss_sampling_metric": cfg.UPLOSS.SAMPLING_METRIC,
+ "up_loss_topk": cfg.UPLOSS.TOPK,
+ "up_loss_alpha": cfg.UPLOSS.ALPHA,
+ "up_loss_weight": cfg.UPLOSS.WEIGHT,
+ "ins_con_out_dim": cfg.ICLOSS.OUT_DIM,
+ "ins_con_queue_size": cfg.ICLOSS.QUEUE_SIZE,
+ "ins_con_in_queue_size": cfg.ICLOSS.IN_QUEUE_SIZE,
+ "ins_con_batch_iou_thr": cfg.ICLOSS.BATCH_IOU_THRESH,
+ "ins_con_queue_iou_thr": cfg.ICLOSS.QUEUE_IOU_THRESH,
+ "ins_con_queue_tau": cfg.ICLOSS.TEMPERATURE,
+ "ins_con_loss_weight": cfg.ICLOSS.WEIGHT,
+ })
+ return ret
+ def get_up_loss(self, scores, gt_classes):
+ # start up loss after warmup iters
+ storage = get_event_storage()
+ if storage.iter > self.up_loss_start_iter:
+ loss_cls_up = self.up_loss(scores, gt_classes)
+ else:
+ loss_cls_up = scores.new_tensor(0.0)
+ return self.up_loss_weight * loss_cls_up
+ def get_ins_con_loss(self, feat, gt_classes, ious):
+ # select foreground and iou > thr instance in a mini-batch
+ pos_inds = (ious > self.ins_con_batch_iou_thr) & (
+ gt_classes != self.num_classes)
+ if not pos_inds.sum():
+ return feat.new_tensor(0.0)
+ feat, gt_classes = feat[pos_inds], gt_classes[pos_inds]
+ queue = self.queue.reshape(-1, self.ins_con_out_dim)
+ queue_label = self.queue_label.reshape(-1)
+ queue_inds = queue_label != -1 # filter empty queue
+ queue, queue_label = queue[queue_inds], queue_label[queue_inds]
+ loss_ins_con = self.ins_con_loss(feat, gt_classes, queue, queue_label)
+ # loss decay
+ storage = get_event_storage()
+ decay_weight = 1.0 - storage.iter / self.max_iters
+ return self.ins_con_loss_weight * decay_weight * loss_ins_con
+ @ torch.no_grad()
+ def _dequeue_and_enqueue(self, feat, gt_classes, ious, iou_thr=0.7):
+ """Write selected features of the current batch into the per-class queues.
+
+ Samples are kept when ``ious > iou_thr`` and their label differs from
+ ``self.num_classes``. For every class index below ``num_known_classes``
+ at most ``self.ins_con_in_queue_size`` features are written, starting at
+ that class's ``queue_ptr``. Runs under ``torch.no_grad()``.
+ """
+ # 1. gather variable
+ # (cross-process gathering is disabled here; only local samples are queued)
+ # feat = self.concat_all_gather(feat)
+ # gt_classes = self.concat_all_gather(gt_classes)
+ # ious = self.concat_all_gather(ious)
+ # 2. filter by iou and obj, remove bg
+ keep = (ious > iou_thr) & (gt_classes != self.num_classes)
+ feat, gt_classes = feat[keep], gt_classes[keep]
+ for i in range(self.num_known_classes):
+ ptr = int(self.queue_ptr[i])
+ cls_ind = gt_classes == i
+ cls_feat, cls_gt_classes = feat[cls_ind], gt_classes[cls_ind]
+ # 3. sort by similarity, low sim ranks first
+ # only queue slots of class i that were already written (label != -1)
+ cls_queue = self.queue[i, self.queue_label[i] != -1]
+ # mean cosine similarity of each new feature to the stored ones, sorted ascending
+ _, sim_inds = F.cosine_similarity(
+ cls_feat[:, None], cls_queue[None, :], dim=-1).mean(dim=1).sort()
+ top_sim_inds = sim_inds[:self.ins_con_in_queue_size]
+ cls_feat, cls_gt_classes = cls_feat[top_sim_inds], cls_gt_classes[top_sim_inds]
+ # 4. in queue
+ # if the write would run past the end of the queue, only the part that fits is written
+ batch_size = cls_feat.size(
+ 0) if ptr + cls_feat.size(0) <= self.ins_con_queue_size else self.ins_con_queue_size - ptr
+ self.queue[i, ptr:ptr+batch_size] = cls_feat[:batch_size]
+ self.queue_label[i, ptr:ptr +
+ batch_size] = cls_gt_classes[:batch_size]
+ # advance the pointer; it goes back to 0 once the end of the queue is reached
+ ptr = ptr + batch_size if ptr + batch_size < self.ins_con_queue_size else 0
+ self.queue_ptr[i] = ptr
+ @ torch.no_grad()
+ def concat_all_gather(self, tensor):
+ tensors_gather = [torch.ones_like(tensor) for _ in range(
+ torch.distributed.get_world_size())]
+ torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
+ output = torch.cat(tensors_gather, dim=0)
+ return output
+ def forward(self, batched_inputs: List[Dict[str, Tensor]]):
+ """
+ Run the detector; returns losses in training mode and detections otherwise.
+
+ Args:
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
+ Each item in the list contains the inputs for one image.
+ For now, each item in the list is a dict that contains:
+ * image: Tensor, image in (C, H, W) format.
+ * instances: Instances
+ Other information that's included in the original dicts, such as:
+ * "height", "width" (int): the output resolution of the model, used in inference.
+ See :meth:`postprocess` for details.
+ Returns:
+ In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the
+ loss. Used during training only. In inference, the standard output format, described
+ in :doc:`/tutorials/models`.
+ """
+ images = self.preprocess_image(batched_inputs)
+ features = self.backbone(images.tensor)
+ features = [features[f] for f in self.head_in_features]
+ anchors = self.anchor_generator(features)
+ # the head returns per-level class logits, box deltas and MLP features
+ pred_logits, pred_anchor_deltas, pred_mlp_feats = self.head(features)
+ # Transpose the Hi*Wi*A dimension to the middle:
+ pred_logits = [permute_to_N_HWA_K(
+ x, self.num_classes) for x in pred_logits]
+ pred_anchor_deltas = [permute_to_N_HWA_K(
+ x, 4) for x in pred_anchor_deltas]
+ pred_mlp_feats = [permute_to_N_HWA_K(
+ x, self.ins_con_out_dim) for x in pred_mlp_feats]
+ if self.training:
+ assert not torch.jit.is_scripting(), "Not supported"
+ assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
+ gt_instances = [x["instances"].to(
+ self.device) for x in batched_inputs]
+ # label_anchors also returns an IoU value per anchor, used by the contrastive loss
+ gt_labels, gt_boxes, gt_ious = self.label_anchors(
+ anchors, gt_instances)
+ losses = self.losses(anchors, pred_logits, pred_mlp_feats,
+ gt_labels, pred_anchor_deltas, gt_boxes, gt_ious)
+ # periodically run inference on the training batch for visualization
+ if self.vis_period > 0:
+ storage = get_event_storage()
+ if storage.iter % self.vis_period == 0:
+ results = self.inference(
+ anchors, pred_logits, pred_anchor_deltas, images.image_sizes
+ )
+ self.visualize_training(batched_inputs, results)
+ return losses
+ else:
+ # the MLP features are not used at inference time
+ results = self.inference(
+ anchors, pred_logits, pred_anchor_deltas, images.image_sizes)
+ if torch.jit.is_scripting():
+ return results
+ processed_results = []
+ for results_per_image, input_per_image, image_size in zip(
+ results, batched_inputs, images.image_sizes
+ ):
+ # fall back to the padded-image size when no output size was requested
+ height = input_per_image.get("height", image_size[0])
+ width = input_per_image.get("width", image_size[1])
+ r = detector_postprocess(results_per_image, height, width)
+ processed_results.append({"instances": r})
+ return processed_results
+ def losses(self, anchors, pred_logits, pred_mlp_feats, gt_labels, pred_anchor_deltas, gt_boxes, gt_ious):
+ """
+ Compute the four training losses and update the feature queue.
+
+ Args:
+ anchors (list[Boxes]): a list of #feature level Boxes
+ gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
+ Their shapes are (N, R) and (N, R, 4), respectively, where R is
+ the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
+ pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
+ list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
+ Where K is the number of classes used in `pred_logits`.
+ pred_mlp_feats: list[Tensor], one per level, concatenated along dim 1
+ and indexed with the same anchor mask as the logits.
+ gt_ious: list of per-image tensors, stacked and indexed like `gt_labels`.
+ Returns:
+ dict[str, Tensor]:
+ mapping from a named loss to a scalar tensor
+ storing the loss. Used during training only. The dict keys are:
+ "loss_cls_ce", "loss_box_reg", "loss_ins_con" and "loss_cls_up"
+ """
+ num_images = len(gt_labels)
+ gt_labels = torch.stack(gt_labels) # (N, R)
+ # anchors labelled -1 are ignored; positives are valid and not background
+ valid_mask = gt_labels >= 0
+ pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
+ num_pos_anchors = pos_mask.sum().item()
+ get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
+ # moving average of the positive-anchor count, used to normalize the losses
+ self.loss_normalizer = self.loss_normalizer_momentum * self.loss_normalizer + (
+ 1 - self.loss_normalizer_momentum
+ ) * max(num_pos_anchors, 1)
+ # classification and regression loss
+ gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
+ :, :-1
+ ] # no loss for the last (background) class
+ loss_cls_ce = sigmoid_focal_loss_jit(
+ cat(pred_logits, dim=1)[valid_mask],
+ gt_labels_target.to(pred_logits[0].dtype),
+ alpha=self.focal_loss_alpha,
+ gamma=self.focal_loss_gamma,
+ reduction="sum",
+ )
+ # unknown-probability loss on the same valid anchors (already weighted by get_up_loss)
+ loss_cls_up = self.get_up_loss(cat(pred_logits, dim=1)[
+ valid_mask], gt_labels[valid_mask])
+ gt_ious = torch.stack(gt_ious)
+ # we first store feats in the queue, then compute the loss
+ pred_mlp_feats = cat(pred_mlp_feats, dim=1)[valid_mask] # [N, *, 128]
+ # [N*, 128]
+ pred_mlp_feats = pred_mlp_feats.reshape(-1, pred_mlp_feats.shape[-1])
+ self._dequeue_and_enqueue(
+ pred_mlp_feats, gt_labels[valid_mask], gt_ious[valid_mask], iou_thr=self.ins_con_queue_iou_thr)
+ loss_ins_con = self.get_ins_con_loss(
+ pred_mlp_feats, gt_labels[valid_mask], gt_ious[valid_mask])
+ loss_box_reg = _dense_box_regression_loss(
+ anchors,
+ self.box2box_transform,
+ pred_anchor_deltas,
+ gt_boxes,
+ pos_mask,
+ box_reg_loss_type=self.box_reg_loss_type,
+ smooth_l1_beta=self.smooth_l1_beta,
+ )
+ # only the focal and box losses are divided by the normalizer here
+ return {
+ "loss_cls_ce": loss_cls_ce / self.loss_normalizer,
+ "loss_box_reg": loss_box_reg / self.loss_normalizer,
+ "loss_ins_con": loss_ins_con,
+ "loss_cls_up": loss_cls_up,
+ }
+ @torch.no_grad()
+ def label_anchors(self, anchors, gt_instances):
+ """Assign a label, a matched GT box and an IoU value to every anchor.
+
+ Args:
+ anchors (list[Boxes]): per-level anchors; concatenated before matching.
+ gt_instances (list[Instances]): ground truth of each image.
+ Returns:
+ Three lists with one entry per image: anchor labels (``self.num_classes``
+ for background, -1 for ignored), matched GT boxes and matched IoU values.
+ """
+ anchors = Boxes.cat(anchors) # Rx4
+ gt_labels = []
+ matched_gt_boxes = []
+ matched_gt_ious = []
+ for gt_per_image in gt_instances:
+ match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
+ matched_idxs, anchor_labels = self.anchor_matcher(
+ match_quality_matrix)
+ # del match_quality_matrix
+ if len(gt_per_image) > 0:
+ matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]
+ # NOTE(review): max(dim=1) reduces over the second axis of the IoU matrix
+ # (presumably the anchor axis), so this looks like each matched GT's best IoU
+ # over all anchors rather than the anchor's own IoU with that GT - confirm intent.
+ matched_gt_ious_i = match_quality_matrix.max(dim=1)[
+ 0][matched_idxs]
+ gt_labels_i = gt_per_image.gt_classes[matched_idxs]
+ # Anchors with label 0 are treated as background.
+ gt_labels_i[anchor_labels == 0] = self.num_classes
+ # Anchors with label -1 are ignored.
+ gt_labels_i[anchor_labels == -1] = -1
+ else:
+ # image without annotations: every anchor becomes background
+ matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
+ # zeros_like keeps the dtype of matched_idxs, so these IoUs are not floating point
+ matched_gt_ious_i = torch.zeros_like(matched_idxs)
+ gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes
+ gt_labels.append(gt_labels_i)
+ matched_gt_boxes.append(matched_gt_boxes_i)
+ matched_gt_ious.append(matched_gt_ious_i)
+ del match_quality_matrix
+ return gt_labels, matched_gt_boxes, matched_gt_ious
+class OpenSetRetinaNetHead(RetinaNetHead):
+ """
+ The head used in RetinaNet for object classification and box regression.
+ It has two subnets for the two tasks, with a common structure but separate parameters.
+ """
+ @configurable
+ def __init__(
+ self,
+ *args,
+ ins_con_out_dim,
+ **kargs
+ ):
+ super().__init__(*args, **kargs)
+ self.mlp = ConvMLP(kargs["conv_dims"][-1], ins_con_out_dim * kargs["num_anchors"])
+ @classmethod
+ def from_config(cls, cfg, input_shape: List[ShapeSpec]):
+ ret = super().from_config(cfg, input_shape)
+ ret["ins_con_out_dim"] = cfg.ICLOSS.OUT_DIM
+ return ret
+ def forward(self, features: List[Tensor]):
+ """
+ Arguments:
+ features (list[Tensor]): FPN feature map tensors in high to low resolution.
+ Each tensor in the list correspond to different feature levels.
+ Returns:
+ logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
+ The tensor predicts the classification probability
+ at each spatial position for each of the A anchors and K object
+ classes.
+ bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
+ The tensor predicts 4-vector (dx,dy,dw,dh) box
+ regression values for every anchor. These values are the
+ relative offset between the anchor and the ground truth box.
+ """
+ logits = []
+ mlp_feats = []
+ bbox_reg = []
+ for feature in features:
+ cls_feat = self.cls_subnet(feature)
+ mlp_feats.append(self.mlp(cls_feat))
+ logits.append(self.cls_score(cls_feat))
+ bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
+ return logits, bbox_reg, mlp_feats
diff --git a/opendet2/modeling/roi_heads/__init__.py b/opendet2/modeling/roi_heads/__init__.py
new file mode 100644
index 0000000..546b918
--- /dev/null
+++ b/opendet2/modeling/roi_heads/__init__.py
@@ -0,0 +1,4 @@
+from .roi_heads import OpenSetStandardROIHeads
+from .box_head import FastRCNNSeparateConvFCHead, FastRCNNSeparateDropoutConvFCHead
+__all__ = list(globals().keys())
diff --git a/opendet2/modeling/roi_heads/box_head.py b/opendet2/modeling/roi_heads/box_head.py
new file mode 100644
index 0000000..6dc4d19
--- /dev/null
+++ b/opendet2/modeling/roi_heads/box_head.py
@@ -0,0 +1,163 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import List
+import fvcore.nn.weight_init as weight_init
+import numpy as np
+import torch
+from detectron2.config import configurable
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling.roi_heads import ROI_BOX_HEAD_REGISTRY
+from detectron2.utils.registry import Registry
+from torch import nn
+class FastRCNNSeparateConvFCHead(nn.Module):
+ """
+ FastRCNN with separate ConvFC layers
+
+ Two independent conv/fc stacks with identical layout are built from the same
+ input: one for box regression (``reg_*``) and one for classification (``cls_*``).
+ """
+ @configurable
+ def __init__(
+ self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm=""
+ ):
+ """
+ NOTE: this interface is experimental.
+ Args:
+ input_shape (ShapeSpec): shape of the input feature.
+ conv_dims (list[int]): the output dimensions of the conv layers
+ fc_dims (list[int]): the output dimensions of the fc layers
+ conv_norm (str or callable): normalization for the conv layers.
+ See :func:`detectron2.layers.get_norm` for supported types.
+ """
+ super().__init__()
+ assert len(conv_dims) + len(fc_dims) > 0
+ self.conv_dims = conv_dims
+ self.fc_dims = fc_dims
+ # tracks the feature shape while layers are added: (channels, height, width)
+ self._output_size = (input_shape.channels,
+ input_shape.height, input_shape.width)
+ self.reg_conv_norm_relus = self._add_conv_norm_relus(
+ self._output_size[0], conv_dims, conv_norm)
+ self.cls_conv_norm_relus = self._add_conv_norm_relus(
+ self._output_size[0], conv_dims, conv_norm)
+ # the convs use kernel 3 / padding 1, so only the channel count changes
+ conv_dim = self._output_size[0] if len(conv_dims) == 0 else conv_dims[-1]
+ self._output_size = (
+ conv_dim, self._output_size[1], self._output_size[2])
+ self.reg_fcs = self._add_fcs(np.prod(self._output_size), fc_dims)
+ self.cls_fcs = self._add_fcs(np.prod(self._output_size), fc_dims)
+ # with fc layers the output size becomes a plain int (the last fc width)
+ self._output_size = self._output_size if len(fc_dims)==0 else fc_dims[-1]
+ for layer in self.reg_conv_norm_relus:
+ weight_init.c2_msra_fill(layer)
+ for layer in self.cls_conv_norm_relus:
+ weight_init.c2_msra_fill(layer)
+ # the fc stacks also contain Flatten/ReLU modules; only Linear layers are initialised
+ for layer in self.cls_fcs:
+ if isinstance(layer, nn.Linear):
+ weight_init.c2_xavier_fill(layer)
+ for layer in self.reg_fcs:
+ if isinstance(layer, nn.Linear):
+ weight_init.c2_xavier_fill(layer)
+ @classmethod
+ def from_config(cls, cfg, input_shape):
+ """Build the constructor kwargs from ``cfg.MODEL.ROI_BOX_HEAD``."""
+ num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV
+ conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM
+ num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC
+ fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM
+ return {
+ "input_shape": input_shape,
+ "conv_dims": [conv_dim] * num_conv,
+ "fc_dims": [fc_dim] * num_fc,
+ "conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM,
+ }
+ def _add_conv_norm_relus(self, input_dim, conv_dims, conv_norm):
+ """Return an ``nn.Sequential`` of 3x3 conv + norm + ReLU layers."""
+ conv_norm_relus = []
+ for k, conv_dim in enumerate(conv_dims):
+ conv = Conv2d(
+ input_dim,
+ conv_dim,
+ kernel_size=3,
+ padding=1,
+ # no bias when a norm layer is configured
+ bias=not conv_norm,
+ norm=get_norm(conv_norm, conv_dim),
+ activation=nn.ReLU(),
+ )
+ input_dim = conv_dim
+ conv_norm_relus.append(conv)
+ return nn.Sequential(*conv_norm_relus)
+ def _add_fcs(self, input_dim, fc_dims):
+ """Return an ``nn.Sequential`` of Linear + ReLU layers, preceded by one Flatten."""
+ fcs = []
+ for k, fc_dim in enumerate(fc_dims):
+ # flatten once, in front of the first fc layer
+ if k == 0:
+ fcs.append(nn.Flatten())
+ fc = nn.Linear(int(input_dim), fc_dim)
+ fcs.append(fc)
+ fcs.append(nn.ReLU())
+ input_dim = fc_dim
+ return nn.Sequential(*fcs)
+ def forward(self, x):
+ """Return ``(reg_feat, cls_feat)`` computed from ``x`` by the two stacks."""
+ reg_feat = x
+ cls_feat = x
+ if len(self.conv_dims) > 0:
+ reg_feat = self.reg_conv_norm_relus(x)
+ cls_feat = self.cls_conv_norm_relus(x)
+ if len(self.fc_dims) > 0:
+ reg_feat = self.reg_fcs(reg_feat)
+ cls_feat = self.cls_fcs(cls_feat)
+ return reg_feat, cls_feat
+ @property
+ @torch.jit.unused
+ def output_shape(self):
+ """
+ Returns:
+ ShapeSpec: the output feature shape
+ """
+ o = self._output_size
+ # an int means the head ends with fc layers; otherwise it is (C, H, W)
+ if isinstance(o, int):
+ return ShapeSpec(channels=o)
+ else:
+ return ShapeSpec(channels=o[0], height=o[1], width=o[2])
+class FastRCNNSeparateDropoutConvFCHead(nn.Module):
+ """Add dropout before each conv/fc layer
+ """
+ def _add_conv_norm_relus(self, input_dim, conv_dims, conv_norm):
+ conv_norm_relus = []
+ for k, conv_dim in enumerate(conv_dims):
+ conv = Conv2d(
+ input_dim,
+ conv_dim,
+ kernel_size=3,
+ padding=1,
+ bias=not conv_norm,
+ norm=get_norm(conv_norm, conv_dim),
+ activation=nn.ReLU(),
+ )
+ input_dim = conv_dim
+ conv_norm_relus.append(nn.Dropout2d(p=0.5))
+ conv_norm_relus.append(conv)
+ return nn.Sequential(*conv_norm_relus)
+ def _add_fcs(self, input_dim, fc_dims):
+ fcs = []
+ for k, fc_dim in enumerate(fc_dims):
+ if k == 0:
+ fcs.append(nn.Flatten())
+ fc = nn.Linear(int(input_dim), fc_dim)
+ fcs.append(nn.Dropout2d(p=0.5))
+ fcs.append(fc)
+ fcs.append(nn.ReLU())
+ input_dim = fc_dim
+ return nn.Sequential(*fcs)
diff --git a/opendet2/modeling/roi_heads/fast_rcnn.py b/opendet2/modeling/roi_heads/fast_rcnn.py
new file mode 100644
index 0000000..6cfb516
--- /dev/null
+++ b/opendet2/modeling/roi_heads/fast_rcnn.py
@@ -0,0 +1,645 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import itertools
+import logging
+import math
+import os
+import random
+from typing import Dict, List, Tuple, Union
+import numpy as np
+import torch
+import torch.distributions as dists
+from detectron2.config import configurable
+from detectron2.layers import (ShapeSpec, batched_nms, cat, cross_entropy,
+ nonzero_tuple)
+from detectron2.modeling.box_regression import Box2BoxTransform
+from detectron2.modeling.roi_heads.fast_rcnn import (FastRCNNOutputLayers,
+ _log_classification_stats)
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.structures.boxes import matched_boxlist_iou
+# fast_rcnn_inference)
+from detectron2.utils import comm
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+from fvcore.nn import giou_loss, smooth_l1_loss
+from torch import nn
+from torch.nn import functional as F
+from ..layers import MLP
+from ..losses import ICLoss, UPLoss
+def fast_rcnn_inference(
+ boxes: List[torch.Tensor],
+ scores: List[torch.Tensor],
+ image_shapes: List[Tuple[int, int]],
+ score_thresh: float,
+ nms_thresh: float,
+ topk_per_image: int,
+ vis_iou_thr: float = 1.0,
+ result_per_image = [
+ fast_rcnn_inference_single_image(
+ boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image, vis_iou_thr
+ )
+ for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
+ ]
+ return [x[0] for x in result_per_image], [x[1] for x in result_per_image]
+def fast_rcnn_inference_single_image(
+ boxes,
+ scores,
+ image_shape: Tuple[int, int],
+ score_thresh: float,
+ nms_thresh: float,
+ topk_per_image: int,
+ vis_iou_thr: float,
+ valid_mask = torch.isfinite(boxes).all(
+ dim=1) & torch.isfinite(scores).all(dim=1)
+ if not valid_mask.all():
+ boxes = boxes[valid_mask]
+ scores = scores[valid_mask]
+ scores = scores[:, :-1]
+ num_bbox_reg_classes = boxes.shape[1] // 4
+ # Convert to Boxes to use the `clip` function ...
+ boxes = Boxes(boxes.reshape(-1, 4))
+ boxes.clip(image_shape)
+ boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4
+ # 1. Filter results based on detection scores. It can make NMS more efficient
+ # by filtering out low-confidence detections.
+ filter_mask = scores > score_thresh # R x K
+ # R' x 2. First column contains indices of the R predictions;
+ # Second column contains indices of classes.
+ filter_inds = filter_mask.nonzero()
+ if num_bbox_reg_classes == 1:
+ boxes = boxes[filter_inds[:, 0], 0]
+ else:
+ boxes = boxes[filter_mask]
+ scores = scores[filter_mask]
+ # 2. Apply NMS for each class independently.
+ keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
+ if topk_per_image >= 0:
+ keep = keep[:topk_per_image]
+ boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]
+ # apply nms between known classes and unknown class for visualization.
+ if vis_iou_thr < 1.0:
+ boxes, scores, filter_inds = unknown_aware_nms(
+ boxes, scores, filter_inds, iou_thr=vis_iou_thr)
+ result = Instances(image_shape)
+ result.pred_boxes = Boxes(boxes)
+ result.scores = scores
+ result.pred_classes = filter_inds[:, 1]
+ return result, filter_inds[:, 0]
+def unknown_aware_nms(boxes, scores, labels, ukn_class_id=80, iou_thr=0.9):
+ u_inds = labels[:, 1] == ukn_class_id
+ k_inds = ~u_inds
+ if k_inds.sum() == 0 or u_inds.sum() == 0:
+ return boxes, scores, labels
+ k_boxes, k_scores, k_labels = boxes[k_inds], scores[k_inds], labels[k_inds]
+ u_boxes, u_scores, u_labels = boxes[u_inds], scores[u_inds], labels[u_inds]
+ ious = pairwise_iou(Boxes(k_boxes), Boxes(u_boxes))
+ mask = torch.ones((ious.size(0), ious.size(1), 2), device=ious.device)
+ inds = (ious > iou_thr).nonzero()
+ if not inds.numel():
+ return boxes, scores, labels
+ for [ind_x, ind_y] in inds:
+ if k_scores[ind_x] >= u_scores[ind_y]:
+ mask[ind_x, ind_y, 1] = 0
+ else:
+ mask[ind_x, ind_y, 0] = 0
+ k_inds = mask[..., 0].mean(dim=1) == 1
+ u_inds = mask[..., 1].mean(dim=0) == 1
+ k_boxes, k_scores, k_labels = k_boxes[k_inds], k_scores[k_inds], k_labels[k_inds]
+ u_boxes, u_scores, u_labels = u_boxes[u_inds], u_scores[u_inds], u_labels[u_inds]
+ boxes = torch.cat([k_boxes, u_boxes])
+ scores = torch.cat([k_scores, u_scores])
+ labels = torch.cat([k_labels, u_labels])
+ return boxes, scores, labels
+logger = logging.getLogger(__name__)
+def build_roi_box_output_layers(cfg, input_shape):
+ """
+ Look up an output-layer class in ``ROI_BOX_OUTPUT_LAYERS_REGISTRY`` and
+ instantiate it with ``(cfg, input_shape)``.
+ """
+ # NOTE(review): neither `name` nor `ROI_BOX_OUTPUT_LAYERS_REGISTRY` is assigned in the
+ # visible code; `name` presumably has to be read from `cfg` before this call - confirm
+ # which config key selects the output layers.
+ return ROI_BOX_OUTPUT_LAYERS_REGISTRY.get(name)(cfg, input_shape)
+class CosineFastRCNNOutputLayers(FastRCNNOutputLayers):
+ """
+ Fast R-CNN output layers whose classification scores are scaled cosine
+ similarities between the normalized feature and the normalized class weights.
+ """
+ @configurable
+ def __init__(
+ self,
+ *args,
+ scale: int = 20,
+ vis_iou_thr: float = 1.0,
+ **kargs,
+ ):
+ """
+ Args:
+ scale: factor the cosine similarity is multiplied with.
+ vis_iou_thr: forwarded to ``fast_rcnn_inference`` at inference time.
+ *args, **kargs: forwarded to ``FastRCNNOutputLayers.__init__``.
+ """
+ super().__init__(*args, **kargs)
+ # prediction layer for num_classes foreground classes and one background class (hence + 1)
+ # replaces the parent's cls_score with a bias-free layer of the same input width
+ self.cls_score = nn.Linear(
+ self.cls_score.in_features, self.num_classes + 1, bias=False)
+ nn.init.normal_(self.cls_score.weight, std=0.01)
+ # scaling factor
+ self.scale = scale
+ self.vis_iou_thr = vis_iou_thr
+ @classmethod
+ def from_config(cls, cfg, input_shape):
+ """Add ``scale`` and ``vis_iou_thr`` from ``cfg.MODEL.ROI_HEADS`` to the parent kwargs."""
+ ret = super().from_config(cfg, input_shape)
+ ret['scale'] = cfg.MODEL.ROI_HEADS.COSINE_SCALE
+ ret['vis_iou_thr'] = cfg.MODEL.ROI_HEADS.VIS_IOU_THRESH
+ return ret
+ def forward(self, feats):
+ """Return ``(scores, proposal_deltas)`` for a feature tensor or a ``(reg, cls)`` tuple."""
+ # support shared & separate head
+ if isinstance(feats, tuple):
+ reg_x, cls_x = feats
+ else:
+ reg_x = cls_x = feats
+ if reg_x.dim() > 2:
+ reg_x = torch.flatten(reg_x, start_dim=1)
+ cls_x = torch.flatten(cls_x, start_dim=1)
+ # L2-normalize the classification feature; the epsilon avoids division by zero
+ x_norm = torch.norm(cls_x, p=2, dim=1).unsqueeze(1).expand_as(cls_x)
+ x_normalized = cls_x.div(x_norm + 1e-5)
+ # normalize weight
+ temp_norm = (
+ torch.norm(self.cls_score.weight.data, p=2, dim=1)
+ .unsqueeze(1)
+ .expand_as(self.cls_score.weight.data)
+ )
+ # the stored weights are overwritten through .data on every call
+ self.cls_score.weight.data = self.cls_score.weight.data.div(
+ temp_norm + 1e-5
+ )
+ cos_dist = self.cls_score(x_normalized)
+ scores = self.scale * cos_dist
+ proposal_deltas = self.bbox_pred(reg_x)
+ return scores, proposal_deltas
+ def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
+ """Decode boxes and class probabilities and run ``fast_rcnn_inference`` on them."""
+ boxes = self.predict_boxes(predictions, proposals)
+ scores = self.predict_probs(predictions, proposals)
+ image_shapes = [x.image_size for x in proposals]
+ return fast_rcnn_inference(
+ boxes,
+ scores,
+ image_shapes,
+ self.test_score_thresh,
+ self.test_nms_thresh,
+ self.test_topk_per_image,
+ self.vis_iou_thr,
+ )
+ def predict_boxes(
+ self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
+ ):
+ """Apply ``predictions[1]`` (deltas) to the proposal boxes; returns one tensor per image."""
+ if not len(proposals):
+ return []
+ proposal_deltas = predictions[1]
+ num_prop_per_image = [len(p) for p in proposals]
+ proposal_boxes = cat(
+ [p.proposal_boxes.tensor for p in proposals], dim=0)
+ predict_boxes = self.box2box_transform.apply_deltas(
+ proposal_deltas,
+ proposal_boxes,
+ ) # Nx(KxB)
+ return predict_boxes.split(num_prop_per_image)
+ def predict_probs(
+ self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
+ ):
+ """Softmax ``predictions[0]`` over the last dim and split the rows per image."""
+ scores = predictions[0]
+ num_inst_per_image = [len(p) for p in proposals]
+ probs = F.softmax(scores, dim=-1)
+ return probs.split(num_inst_per_image, dim=0)
+class OpenDetFastRCNNOutputLayers(CosineFastRCNNOutputLayers):
+ @configurable
+ def __init__(
+ self,
+ *args,
+ num_known_classes,
+ max_iters,
+ up_loss_start_iter,
+ up_loss_sampling_metric,
+ up_loss_topk,
+ up_loss_alpha,
+ up_loss_weight,
+ ic_loss_out_dim,
+ ic_loss_queue_size,
+ ic_loss_in_queue_size,
+ ic_loss_batch_iou_thr,
+ ic_loss_queue_iou_thr,
+ ic_loss_queue_tau,
+ ic_loss_weight,
+ **kargs
+ ):
+ """
+ Args:
+ num_known_classes: number of per-class queues that are allocated.
+ max_iters: iteration count used to decay the IC loss weight.
+ up_loss_start_iter: the UP loss is only computed after this iteration.
+ up_loss_sampling_metric, up_loss_topk, up_loss_alpha: forwarded to ``UPLoss``.
+ up_loss_weight: multiplier applied to the UP loss.
+ ic_loss_out_dim: output width of the MLP encoder and of the queued features.
+ ic_loss_queue_size: number of slots per class in the queue.
+ ic_loss_in_queue_size: stored as ``self.ic_loss_in_queue_size``.
+ ic_loss_batch_iou_thr: IoU threshold for samples used in the IC loss.
+ ic_loss_queue_iou_thr: stored as ``self.ic_loss_queue_iou_thr``.
+ ic_loss_queue_tau: passed to ``ICLoss`` as ``tau``.
+ ic_loss_weight: multiplier applied to the IC loss.
+ *args, **kargs: forwarded to the parent constructor.
+ """
+ super().__init__(*args, **kargs)
+ self.num_known_classes = num_known_classes
+ self.max_iters = max_iters
+ self.up_loss = UPLoss(
+ self.num_classes,
+ sampling_metric=up_loss_sampling_metric,
+ topk=up_loss_topk,
+ alpha=up_loss_alpha
+ )
+ self.up_loss_start_iter = up_loss_start_iter
+ self.up_loss_weight = up_loss_weight
+ # MLP applied to the classification feature; its input width matches cls_score's
+ self.encoder = MLP(self.cls_score.in_features, ic_loss_out_dim)
+ self.ic_loss_loss = ICLoss(tau=ic_loss_queue_tau)
+ self.ic_loss_out_dim = ic_loss_out_dim
+ self.ic_loss_queue_size = ic_loss_queue_size
+ self.ic_loss_in_queue_size = ic_loss_in_queue_size
+ self.ic_loss_batch_iou_thr = ic_loss_batch_iou_thr
+ self.ic_loss_queue_iou_thr = ic_loss_queue_iou_thr
+ self.ic_loss_weight = ic_loss_weight
+ # feature queue: (num_known_classes, queue_size, out_dim), zero-initialised
+ self.register_buffer('queue', torch.zeros(
+ self.num_known_classes, ic_loss_queue_size, ic_loss_out_dim))
+ # label of every queue slot; -1 marks a slot that has not been written yet
+ self.register_buffer('queue_label', torch.empty(
+ self.num_known_classes, ic_loss_queue_size).fill_(-1).long())
+ # next write position of each class queue
+ self.register_buffer('queue_ptr', torch.zeros(
+ self.num_known_classes, dtype=torch.long))
+ @classmethod
+ def from_config(cls, cfg, input_shape):
+ ret = super().from_config(cfg, input_shape)
+ ret.update({
+ 'num_known_classes': cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES,
+ "max_iters": cfg.SOLVER.MAX_ITER,
+ "up_loss_start_iter": cfg.UPLOSS.START_ITER,
+ "up_loss_sampling_metric": cfg.UPLOSS.SAMPLING_METRIC,
+ "up_loss_topk": cfg.UPLOSS.TOPK,
+ "up_loss_alpha": cfg.UPLOSS.ALPHA,
+ "up_loss_weight": cfg.UPLOSS.WEIGHT,
+ "ic_loss_out_dim": cfg.ICLOSS.OUT_DIM,
+ "ic_loss_queue_size": cfg.ICLOSS.QUEUE_SIZE,
+ "ic_loss_in_queue_size": cfg.ICLOSS.IN_QUEUE_SIZE,
+ "ic_loss_batch_iou_thr": cfg.ICLOSS.BATCH_IOU_THRESH,
+ "ic_loss_queue_iou_thr": cfg.ICLOSS.QUEUE_IOU_THRESH,
+ "ic_loss_queue_tau": cfg.ICLOSS.TEMPERATURE,
+ "ic_loss_weight": cfg.ICLOSS.WEIGHT,
+ })
+ return ret
+ def forward(self, feats):
+ """Return ``(scores, proposal_deltas, mlp_feat)`` for a feature tensor or a ``(reg, cls)`` tuple."""
+ # support shared & separate head
+ if isinstance(feats, tuple):
+ reg_x, cls_x = feats
+ else:
+ reg_x = cls_x = feats
+ if reg_x.dim() > 2:
+ reg_x = torch.flatten(reg_x, start_dim=1)
+ cls_x = torch.flatten(cls_x, start_dim=1)
+ # L2-normalize the classification feature; the epsilon avoids division by zero
+ x_norm = torch.norm(cls_x, p=2, dim=1).unsqueeze(1).expand_as(cls_x)
+ x_normalized = cls_x.div(x_norm + 1e-5)
+ # normalize weight
+ temp_norm = (
+ torch.norm(self.cls_score.weight.data, p=2, dim=1)
+ .unsqueeze(1)
+ .expand_as(self.cls_score.weight.data)
+ )
+ # the stored weights are overwritten through .data on every call
+ self.cls_score.weight.data = self.cls_score.weight.data.div(
+ temp_norm + 1e-5
+ )
+ cos_dist = self.cls_score(x_normalized)
+ scores = self.scale * cos_dist
+ proposal_deltas = self.bbox_pred(reg_x)
+ # encode feature with MLP
+ # the encoder receives the un-normalized classification feature
+ mlp_feat = self.encoder(cls_x)
+ return scores, proposal_deltas, mlp_feat
+    def get_up_loss(self, scores, gt_classes):
+        """Return the weighted UP loss as ``{"loss_cls_up": tensor}``.
+
+        The term is a zero tensor until the event storage's iteration counter
+        exceeds ``self.up_loss_start_iter``; after that ``self.up_loss`` is used.
+        """
+        still_warming_up = not (get_event_storage().iter > self.up_loss_start_iter)
+        if still_warming_up:
+            raw_loss = scores.new_tensor(0.0)
+        else:
+            raw_loss = self.up_loss(scores, gt_classes)
+        return {"loss_cls_up": self.up_loss_weight * raw_loss}
+    def get_ic_loss(self, feat, gt_classes, ious):
+        """Return the weighted, decayed IC loss as ``{"loss_cls_ic": tensor}``.
+
+        Mini-batch samples are kept when their IoU exceeds
+        ``self.ic_loss_batch_iou_thr`` and their label differs from
+        ``self.num_classes``; they are compared against the filled queue slots.
+        """
+        not_background = gt_classes != self.num_classes
+        selected = (ious > self.ic_loss_batch_iou_thr) & not_background
+        batch_feat, batch_label = feat[selected], gt_classes[selected]
+
+        # Flatten the per-class queues and drop slots still labelled -1 (empty).
+        flat_label = self.queue_label.reshape(-1)
+        flat_queue = self.queue.reshape(-1, self.ic_loss_out_dim)
+        filled = flat_label != -1
+        raw_loss = self.ic_loss_loss(
+            batch_feat, batch_label, flat_queue[filled], flat_label[filled])
+
+        # The weight shrinks linearly with training progress.
+        decay_weight = 1.0 - get_event_storage().iter / self.max_iters
+        return {"loss_cls_ic": self.ic_loss_weight * decay_weight * raw_loss}
+ @torch.no_grad()
+ def _dequeue_and_enqueue(self, feat, gt_classes, ious, iou_thr=0.7):
+ """Write selected foreground features into the per-class queues.
+ Inputs are first gathered with ``concat_all_gather``; rows are kept when
+ their IoU exceeds ``iou_thr`` and their label differs from
+ ``self.num_classes``. Runs under ``torch.no_grad()``.
+ """
+ # 1. gather the inputs from every process
+ feat = self.concat_all_gather(feat)
+ gt_classes = self.concat_all_gather(gt_classes)
+ ious = self.concat_all_gather(ious)
+ # 2. filter by iou and obj, remove bg
+ keep = (ious > iou_thr) & (gt_classes != self.num_classes)
+ feat, gt_classes = feat[keep], gt_classes[keep]
+ # Each known class has its own queue row and its own write pointer.
+ for i in range(self.num_known_classes):
+ ptr = int(self.queue_ptr[i])
+ cls_ind = gt_classes == i
+ cls_feat, cls_gt_classes = feat[cls_ind], gt_classes[cls_ind]
+ # 3. sort by similarity, low sim ranks first
+ # Only slots whose label is not -1 (i.e. already filled) are compared.
+ # NOTE(review): with no filled slot the mean below runs over zero
+ # elements — confirm the resulting ordering is acceptable.
+ cls_queue = self.queue[i, self.queue_label[i] != -1]
+ _, sim_inds = F.cosine_similarity(
+ cls_feat[:, None], cls_queue[None, :], dim=-1).mean(dim=1).sort()
+ # Keep at most ic_loss_in_queue_size of the least similar features.
+ top_sim_inds = sim_inds[:self.ic_loss_in_queue_size]
+ cls_feat, cls_gt_classes = cls_feat[top_sim_inds], cls_gt_classes[top_sim_inds]
+ # 4. in queue
+ # If the new features do not fit before the end of the queue, only the
+ # part that fits is written; the remainder is dropped, not wrapped.
+ batch_size = cls_feat.size(
+ 0) if ptr + cls_feat.size(0) <= self.ic_loss_queue_size else self.ic_loss_queue_size - ptr
+ self.queue[i, ptr:ptr+batch_size] = cls_feat[:batch_size]
+ self.queue_label[i, ptr:ptr + batch_size] = cls_gt_classes[:batch_size]
+ # The pointer returns to slot 0 once it reaches the end of the queue.
+ ptr = ptr + batch_size if ptr + batch_size < self.ic_loss_queue_size else 0
+ self.queue_ptr[i] = ptr
+    @torch.no_grad()
+    def concat_all_gather(self, tensor):
+        """Return ``tensor`` gathered from all processes, joined along dim 0.
+
+        With a world size of 1 the input is returned unchanged.
+        """
+        num_procs = comm.get_world_size()
+        if num_procs != 1:
+            # One receive buffer per process, shaped like the local tensor.
+            buffers = [torch.ones_like(tensor) for _ in range(num_procs)]
+            torch.distributed.all_gather(buffers, tensor, async_op=False)
+            return torch.cat(buffers, dim=0)
+        return tensor
+    def losses(self, predictions, proposals, input_features=None):
+        """Compute the classification, box, UP and IC losses.
+
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that
+                were used to compute predictions. The fields ``proposal_boxes``,
+                ``gt_boxes``, ``gt_classes`` and ``iou`` are read.
+
+        Returns:
+            Dict[str, Tensor]: dict of losses, each scaled by its entry in
+            ``self.loss_weight`` (1.0 when absent).
+        """
+        scores, proposal_deltas, mlp_feat = predictions
+        has_proposals = bool(len(proposals))
+
+        # classification targets
+        if has_proposals:
+            gt_classes = cat([p.gt_classes for p in proposals], dim=0)
+        else:
+            gt_classes = torch.empty(0)
+        _log_classification_stats(scores, gt_classes)
+
+        # box regression targets
+        if not has_proposals:
+            proposal_boxes = gt_boxes = torch.empty(
+                (0, 4), device=proposal_deltas.device)
+        else:
+            proposal_boxes = cat(
+                [p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
+            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
+            # Proposals without "gt_boxes" are all negative and take no part in
+            # the regression loss; their own boxes serve as a placeholder.
+            box_sources = [
+                p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes
+                for p in proposals
+            ]
+            gt_boxes = cat([boxes.tensor for boxes in box_sources], dim=0)
+
+        losses = {}
+        losses["loss_cls_ce"] = cross_entropy(
+            scores, gt_classes, reduction="mean")
+        losses["loss_box_reg"] = self.box_reg_loss(
+            proposal_boxes, gt_boxes, proposal_deltas, gt_classes)
+
+        # UP loss
+        losses.update(self.get_up_loss(scores, gt_classes))
+
+        # The queue is refreshed first, then the IC loss is computed against it.
+        ious = cat([p.iou for p in proposals], dim=0)
+        self._dequeue_and_enqueue(
+            mlp_feat, gt_classes, ious, iou_thr=self.ic_loss_queue_iou_thr)
+        losses.update(self.get_ic_loss(mlp_feat, gt_classes, ious))
+
+        weighted = {}
+        for name, value in losses.items():
+            weighted[name] = value * self.loss_weight.get(name, 1.0)
+        return weighted
+class PROSERFastRCNNOutputLayers(CosineFastRCNNOutputLayers):
+    """Cosine output layers trained with an additional PROSER loss term.
+
+    Besides the cross-entropy and box-regression losses, a second
+    cross-entropy (``loss_cls_proser``) is computed on the scores after each
+    sample's ground-truth column has been removed.
+    """
+
+    @configurable
+    def __init__(self, *args, **kargs):
+        super().__init__(*args, **kargs)
+        # Fixed multiplier applied to the PROSER loss term.
+        self.proser_weight = 0.1
+
+    def get_proser_loss(self, scores, gt_classes):
+        """Return ``{"loss_cls_proser": tensor}`` for the given scores.
+
+        Args:
+            scores: 2-D tensor of class scores, one row per sample.
+            gt_classes: 1-D tensor with one class index per sample.
+        """
+        num_sample, num_classes = scores.shape
+        # Column indices of every score, one row per sample.
+        col_ids = torch.arange(
+            num_classes, device=scores.device).repeat(num_sample, 1)
+        # Drop each row's ground-truth column, leaving num_classes - 1 columns.
+        not_gt = col_ids != gt_classes[:, None]
+        kept_cols = col_ids[not_gt].reshape(num_sample, num_classes - 1)
+        mask_scores = torch.gather(scores, 1, kept_cols)
+
+        # Targets index into the reduced matrix: num_classes - 2 for foreground
+        # samples and num_classes - 1 for background samples.
+        targets = torch.zeros_like(gt_classes)
+        fg_inds = gt_classes != self.num_classes
+        targets[fg_inds] = self.num_classes - 2
+        targets[~fg_inds] = self.num_classes - 1
+
+        loss_cls_proser = cross_entropy(mask_scores, targets)
+        return {"loss_cls_proser": self.proser_weight * loss_cls_proser}
+
+    def losses(self, predictions, proposals, input_features=None):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were used
+                to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
+                ``gt_classes`` are expected.
+
+        Returns:
+            Dict[str, Tensor]: dict of losses
+        """
+        scores, proposal_deltas = predictions
+        # parse classification outputs
+        gt_classes = (
+            cat([p.gt_classes for p in proposals], dim=0) if len(
+                proposals) else torch.empty(0)
+        )
+        _log_classification_stats(scores, gt_classes)
+        # parse box regression outputs
+        if len(proposals):
+            proposal_boxes = cat(
+                [p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
+            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
+            # If "gt_boxes" does not exist, the proposals must be all negative and
+            # should not be included in regression loss computation.
+            # Here we just use proposal_boxes as an arbitrary placeholder because its
+            # value won't be used in self.box_reg_loss().
+            gt_boxes = cat(
+                [(p.gt_boxes if p.has("gt_boxes")
+                  else p.proposal_boxes).tensor for p in proposals],
+                dim=0,
+            )
+        else:
+            proposal_boxes = gt_boxes = torch.empty(
+                (0, 4), device=proposal_deltas.device)
+        losses = {
+            "loss_cls_ce": cross_entropy(scores, gt_classes, reduction="mean"),
+            "loss_box_reg": self.box_reg_loss(
+                proposal_boxes, gt_boxes, proposal_deltas, gt_classes
+            ),
+        }
+        losses.update(self.get_proser_loss(scores, gt_classes))
+        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
+class DropoutFastRCNNOutputLayers(CosineFastRCNNOutputLayers):
+ # Cosine output layers that apply dropout to the normalized classification
+ # features and, at inference, average several forward passes before
+ # thresholding on the entropy of the averaged prediction.
+ @configurable
+ def __init__(self, *args, **kargs):
+ super().__init__(*args, **kargs)
+ # Dropout applied to the normalized classification features in forward().
+ self.dropout = nn.Dropout(p=0.5)
+ # Normalized-entropy threshold used by predict_probs().
+ self.entropy_thr = 0.25
+ def forward(self, feats, testing=False):
+ # Returns (scores, proposal_deltas). With testing=True the dropout module
+ # is switched to training mode so that it stays active.
+ # support shared & separate head
+ if isinstance(feats, tuple):
+ reg_x, cls_x = feats
+ else:
+ reg_x = cls_x = feats
+ if reg_x.dim() > 2:
+ reg_x = torch.flatten(reg_x, start_dim=1)
+ cls_x = torch.flatten(cls_x, start_dim=1)
+ # L2-normalize each feature row; the epsilon avoids division by zero.
+ x_norm = torch.norm(cls_x, p=2, dim=1).unsqueeze(1).expand_as(cls_x)
+ x_normalized = cls_x.div(x_norm + 1e-5)
+ # normalize weight
+ temp_norm = (
+ torch.norm(self.cls_score.weight.data, p=2, dim=1)
+ .unsqueeze(1)
+ .expand_as(self.cls_score.weight.data)
+ )
+ # The normalized weights are written back to the layer.
+ self.cls_score.weight.data = self.cls_score.weight.data.div(
+ temp_norm + 1e-5
+ )
+ # NOTE(review): indentation is not recoverable in this listing, so it is
+ # unclear whether the dropout call below belongs to the `if testing:`
+ # body or runs unconditionally — confirm against the original file.
+ if testing:
+ self.dropout.train()
+ x_normalized = self.dropout(x_normalized)
+ cos_dist = self.cls_score(x_normalized)
+ scores = self.scale * cos_dist
+ proposal_deltas = self.bbox_pred(reg_x)
+ return scores, proposal_deltas
+ def inference(self, predictions: List[Tuple[torch.Tensor, torch.Tensor]], proposals: List[Instances]):
+ """
+ Args:
+ predictions: return values of :meth:`forward()`.
+ proposals (list[Instances]): proposals that match the features that were
+ used to compute predictions. The ``proposal_boxes`` field is expected.
+ Returns:
+ list[Instances]: same as `fast_rcnn_inference`.
+ list[Tensor]: same as `fast_rcnn_inference`.
+ """
+ # Boxes are decoded from the first prediction only; class probabilities
+ # are derived from all predictions in predict_probs().
+ boxes = self.predict_boxes(predictions[0], proposals)
+ scores = self.predict_probs(predictions, proposals)
+ image_shapes = [x.image_size for x in proposals]
+ return fast_rcnn_inference(
+ boxes,
+ scores,
+ image_shapes,
+ self.test_score_thresh,
+ self.test_nms_thresh,
+ self.test_topk_per_image,
+ )
+ def predict_probs(
+ self, predictions: List[Tuple[torch.Tensor, torch.Tensor]], proposals: List[Instances]
+ ):
+ """
+ Args:
+ predictions: return values of :meth:`forward()`.
+ proposals (list[Instances]): proposals that match the features that were
+ used to compute predictions.
+ Returns:
+ list[Tensor]:
+ A list of Tensors of predicted class probabilities for each image.
+ Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i.
+ """
+ # mean of multiple observations
+ scores = torch.stack([pred[0] for pred in predictions], dim=-1)
+ scores = scores.mean(dim=-1)
+ # threshold by entropy
+ # The entropy of the softmax distribution is divided by log(num_classes).
+ norm_entropy = dists.Categorical(scores.softmax(
+ dim=1)).entropy() / np.log(self.num_classes)
+ inds = norm_entropy > self.entropy_thr
+ max_scores = scores.max(dim=1)[0]
+ # set those with high entropy unknown objects
+ # Their rows are zeroed and the former row maximum is written to column
+ # num_classes - 1 before the final softmax.
+ scores[inds, :] = 0.0
+ scores[inds, self.num_classes-1] = max_scores[inds]
+ num_inst_per_image = [len(p) for p in proposals]
+ probs = F.softmax(scores, dim=-1)
+ return probs.split(num_inst_per_image, dim=0)
diff --git a/opendet2/modeling/roi_heads/roi_heads.py b/opendet2/modeling/roi_heads/roi_heads.py
new file mode 100644
index 0000000..7d1310d
--- /dev/null
+++ b/opendet2/modeling/roi_heads/roi_heads.py
@@ -0,0 +1,150 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from typing import Dict, List
+import numpy as np
+import torch
+import torch.nn.functional as F
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.modeling.roi_heads.box_head import build_box_head
+from detectron2.modeling.roi_heads.roi_heads import (
+ ROI_HEADS_REGISTRY, StandardROIHeads, add_ground_truth_to_proposals)
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+from torch import nn
+from .fast_rcnn import build_roi_box_output_layers
+logger = logging.getLogger(__name__)
+class OpenSetStandardROIHeads(StandardROIHeads):
+    """Standard ROI heads that also record each sampled proposal's IoU.
+
+    ``label_and_sample_proposals`` attaches an ``iou`` field to every sampled
+    proposal, and ``_init_box_head`` builds the box predictor through
+    ``build_roi_box_output_layers``.
+    """
+
+    @torch.no_grad()
+    def label_and_sample_proposals(self, proposals: List[Instances], targets: List[Instances]) -> List[Instances]:
+        """Match proposals to ground truth, sample them and attach targets.
+
+        Args:
+            proposals: per-image proposals carrying ``proposal_boxes``.
+            targets: per-image ground truth carrying ``gt_boxes`` and
+                ``gt_classes``.
+
+        Returns:
+            The sampled proposals per image, each with ``gt_classes``, ``iou``
+            and, when the image has ground truth, the remaining ``gt_*`` fields.
+        """
+        if self.proposal_append_gt:
+            proposals = add_ground_truth_to_proposals(targets, proposals)
+        proposals_with_gt = []
+        num_fg_samples = []
+        num_bg_samples = []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            has_gt = len(targets_per_image) > 0
+            match_quality_matrix = pairwise_iou(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(
+                match_quality_matrix)
+            sampled_idxs, gt_classes = self._sample_proposals(
+                matched_idxs, matched_labels, targets_per_image.gt_classes
+            )
+            # Set target attributes of the sampled proposals:
+            proposals_per_image = proposals_per_image[sampled_idxs]
+            proposals_per_image.gt_classes = gt_classes
+            # NOTE: add iou of each proposal (best overlap with any gt box).
+            if has_gt:
+                ious, _ = match_quality_matrix.max(dim=0)
+            else:
+                # max() cannot reduce over zero ground-truth rows, so an image
+                # without ground truth gets an IoU of 0 for every proposal.
+                ious = match_quality_matrix.new_zeros(
+                    match_quality_matrix.size(1))
+            proposals_per_image.iou = ious[sampled_idxs]
+            if has_gt:
+                sampled_targets = matched_idxs[sampled_idxs]
+                for (trg_name, trg_value) in targets_per_image.get_fields().items():
+                    if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
+                        proposals_per_image.set(
+                            trg_name, trg_value[sampled_targets])
+            num_bg_samples.append(
+                (gt_classes == self.num_classes).sum().item())
+            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
+            proposals_with_gt.append(proposals_per_image)
+        # Log the number of fg/bg samples that are selected for training ROI heads
+        storage = get_event_storage()
+        storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
+        storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
+        return proposals_with_gt
+
+    @classmethod
+    def _init_box_head(cls, cfg, input_shape):
+        """Build the box pooler, box head and box predictor from ``cfg``.
+
+        Returns:
+            dict with keys ``box_in_features``, ``box_pooler``, ``box_head``
+            and ``box_predictor``.
+        """
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
+        # Previously used below without being defined (NameError).
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        # fmt: on
+        # If StandardROIHeads is applied on multiple feature maps (as in FPN),
+        # then we share the same predictors and therefore the channel counts must be the same
+        in_channels = [input_shape[f].channels for f in in_features]
+        # Check all channel counts are equal
+        assert len(set(in_channels)) == 1, in_channels
+        in_channels = in_channels[0]
+        box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        box_head = build_box_head(
+            cfg, ShapeSpec(channels=in_channels,
+                           height=pooler_resolution, width=pooler_resolution)
+        )
+        # register output layers
+        box_predictor = build_roi_box_output_layers(cfg, box_head.output_shape)
+        return {
+            "box_in_features": in_features,
+            "box_pooler": box_pooler,
+            "box_head": box_head,
+            "box_predictor": box_predictor,
+        }
+class DropoutStandardROIHeads(OpenSetStandardROIHeads):
+    """ROI heads that run the box predictor several times at inference."""
+
+    @configurable
+    def __init__(self, *args, **kwargs,):
+        super().__init__(*args, **kwargs)
+        # Number of predictor passes made per inference call.
+        self.num_sample = 30
+
+    def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances], targets=None):
+        """Run the box branch.
+
+        Returns the loss dict in training mode and the predicted instances
+        otherwise.
+        """
+        feature_maps = [features[name] for name in self.box_in_features]
+        boxes_per_image = [x.proposal_boxes for x in proposals]
+        box_features = self.box_head(
+            self.box_pooler(feature_maps, boxes_per_image))
+
+        if not self.training:
+            # Inference: collect one prediction per dropout pass.
+            predictions = []
+            for _ in range(self.num_sample):
+                predictions.append(
+                    self.box_predictor(box_features, testing=True))
+            del box_features
+            pred_instances, _ = self.box_predictor.inference(
+                predictions, proposals)
+            return pred_instances
+
+        predictions = self.box_predictor(box_features)
+        del box_features
+        losses = self.box_predictor.losses(predictions, proposals)
+        # Proposals are overwritten in place below, so the losses come first.
+        if self.train_on_pred_boxes:
+            with torch.no_grad():
+                pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
+                    predictions, proposals
+                )
+                for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
+                    proposals_per_image.proposal_boxes = Boxes(
+                        pred_boxes_per_image)
+        return losses
diff --git a/opendet2/solver/__init__.py b/opendet2/solver/__init__.py
new file mode 100644
index 0000000..9bba8b7
--- /dev/null
+++ b/opendet2/solver/__init__.py
@@ -0,0 +1,3 @@
+from .build import *
+__all__ = list(globals().keys())
diff --git a/opendet2/solver/build.py b/opendet2/solver/build.py
new file mode 100644
index 0000000..00be765
--- /dev/null
+++ b/opendet2/solver/build.py
@@ -0,0 +1,57 @@
+from typing import Any, Dict, List, Set
+import torch
+from detectron2.config import CfgNode
+from detectron2.solver.build import maybe_add_gradient_clipping
+def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
+    """
+    Build an optimizer from config.
+
+    Every trainable parameter gets its own parameter group with the base
+    learning rate and a weight decay chosen by kind (norm layer, bias, other).
+    AdamW is used when the solver config selects it, SGD otherwise; gradient
+    clipping is then applied through ``maybe_add_gradient_clipping``.
+    """
+    norm_module_types = (
+        torch.nn.BatchNorm1d,
+        torch.nn.BatchNorm2d,
+        torch.nn.BatchNorm3d,
+        torch.nn.SyncBatchNorm,
+        # NaiveSyncBatchNorm inherits from BatchNorm2d
+        torch.nn.GroupNorm,
+        torch.nn.InstanceNorm1d,
+        torch.nn.InstanceNorm2d,
+        torch.nn.InstanceNorm3d,
+        torch.nn.LayerNorm,
+        torch.nn.LocalResponseNorm,
+    )
+    params: List[Dict[str, Any]] = []
+    memo: Set[torch.nn.parameter.Parameter] = set()
+    for module in model.modules():
+        for key, value in module.named_parameters(recurse=False):
+            if not value.requires_grad:
+                continue
+            # Avoid duplicating parameters
+            if value in memo:
+                continue
+            memo.add(value)
+            lr = cfg.SOLVER.BASE_LR
+            weight_decay = cfg.SOLVER.WEIGHT_DECAY
+            if isinstance(module, norm_module_types):
+                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
+            elif key == "bias":
+                # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
+                # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
+                # hyperparameters are by default exactly the same as for regular
+                # weights.
+                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
+            params += [{"params": [value], "lr": lr,
+                        "weight_decay": weight_decay}]
+    # To support AdamW for swin_transformer
+    # NOTE(review): the selector is assumed to live at SOLVER.OPTIMIZER with
+    # the value "ADAMW" — confirm against add_opendet_config. A missing key
+    # falls back to SGD.
+    optimizer_name = str(cfg.SOLVER.get("OPTIMIZER", "SGD")).upper()
+    if optimizer_name == "ADAMW":
+        optimizer = torch.optim.AdamW(
+            params, lr=cfg.SOLVER.BASE_LR, betas=cfg.SOLVER.BETAS, weight_decay=cfg.SOLVER.WEIGHT_DECAY)
+    else:
+        optimizer = torch.optim.SGD(
+            params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
+    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
+    return optimizer
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..015e7bf
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+"""Packaging script for the opendet2 codebase."""
+from setuptools import setup
+
+# The keyword arguments below must be passed to setup(); as bare lines they
+# are not valid Python.
+setup(
+    name="opendet2",
+    version="0.1",
+    author="csuhan",
+    url="https://github.com/csuhan/opendet2",
+    description="Codebase for open set object detection",
+    python_requires=">=3.6",
+    install_requires=[
+        'timm', 'opencv-python'
+    ],
+)
diff --git a/tools/convert_swin_to_d2.py b/tools/convert_swin_to_d2.py
new file mode 100644
index 0000000..2947e5d
--- /dev/null
+++ b/tools/convert_swin_to_d2.py
@@ -0,0 +1,36 @@
+import argparse
+import os
+import torch
+def parse_args():
+    """Parse the two positional paths: the source model and the output model."""
+    cli = argparse.ArgumentParser("Convert Swin Transformer to Detectron2")
+    positional = (
+        ("source_model", "Source model"),
+        ("output_model", "Output model"),
+    )
+    for arg_name, help_text in positional:
+        cli.add_argument(arg_name, default="", type=str, help=help_text)
+    return cli.parse_args()
+def main():
+    """Re-key a checkpoint's ``"model"`` weights and save them.
+
+    Every key of the source checkpoint's ``"model"`` dict is prefixed with
+    ``backbone.bottom_up.`` and the resulting dict is written to the output
+    path. A source path that does not end in ``.pth`` raises ``ValueError``.
+    """
+    args = parse_args()
+    _, extension = os.path.splitext(args.source_model)
+    if extension != ".pth":
+        raise ValueError("You should save weights as pth file")
+
+    checkpoint = torch.load(
+        args.source_model, map_location=torch.device('cpu'))
+    source_weights = checkpoint["model"]
+
+    prefix = 'backbone.bottom_up.'
+    converted_weights = {
+        prefix + key: tensor for key, tensor in source_weights.items()
+    }
+    torch.save(converted_weights, args.output_model)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/train_net.py b/tools/train_net.py
new file mode 100644
index 0000000..cfe9cb2
--- /dev/null
+++ b/tools/train_net.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+import os
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.engine import (default_argument_parser, default_setup, hooks,
+ launch)
+from detectron2.evaluation import verify_results
+from detectron2.utils.logger import setup_logger
+from opendet2 import OpenDetTrainer, add_opendet_config, builtin
+def setup(args):
+    """
+    Create configs and perform basic setups.
+
+    Merges the config file and command-line overrides into a config that
+    includes the opendet options, derives a per-config output directory when
+    the default ``./output`` is in use, freezes the config and sets up logging.
+
+    Returns:
+        The frozen config.
+    """
+    cfg = get_cfg()
+    # add opendet config
+    add_opendet_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    # Note: we use the key ROI_HEAD.NUM_KNOWN_CLASSES
+    # for open-set data processing and evaluation.
+    if 'RetinaNet' in cfg.MODEL.META_ARCHITECTURE:
+        # This branch had no body, which made the output-dir logic below
+        # conditional on RetinaNet.
+        # NOTE(review): the RetinaNet known-class count is assumed to live at
+        # MODEL.RETINANET.NUM_KNOWN_CLASSES — confirm against add_opendet_config.
+        cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES = cfg.MODEL.RETINANET.NUM_KNOWN_CLASSES
+    # add output dir if not exist
+    if cfg.OUTPUT_DIR == "./output":
+        config_name = os.path.basename(args.config_file).split(".yaml")[0]
+        cfg.OUTPUT_DIR = os.path.join(cfg.OUTPUT_DIR, config_name)
+    cfg.freeze()
+    default_setup(cfg, args)
+    setup_logger(output=cfg.OUTPUT_DIR,
+                 distributed_rank=comm.get_rank(), name="opendet2")
+    return cfg
+def main(args):
+    """Evaluate a checkpoint or train a model, depending on ``args.eval_only``.
+
+    Test-time augmentation is evaluated only when ``cfg.TEST.AUG.ENABLED`` is
+    set, both in eval-only mode and as a hook during training.
+
+    Returns:
+        The evaluation results in eval-only mode, otherwise the return value
+        of ``trainer.train()``.
+    """
+    cfg = setup(args)
+    if args.eval_only:
+        model = OpenDetTrainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = OpenDetTrainer.test(cfg, model)
+        if cfg.TEST.AUG.ENABLED:
+            res.update(OpenDetTrainer.test_with_TTA(cfg, model))
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    # If you'd like to do anything fancier than the standard training logic,
+    # consider writing your own training loop (see plain_train_net.py) or
+    # subclassing the trainer.
+    trainer = OpenDetTrainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    if cfg.TEST.AUG.ENABLED:
+        trainer.register_hooks(
+            [hooks.EvalHook(
+                0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
+        )
+    return trainer.train()
+if __name__ == "__main__":
+    # Parse the standard arguments, echo them, then hand `main` to the launcher.
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch_options = {
+        "num_machines": args.num_machines,
+        "machine_rank": args.machine_rank,
+        "dist_url": args.dist_url,
+        "args": (args,),
+    }
+    launch(main, args.num_gpus, **launch_options)