From 9d1e164a7bdd0ca32cc77eea457d50f511595951 Mon Sep 17 00:00:00 2001 From: zhiwei Date: Fri, 18 Oct 2024 16:06:03 +0800 Subject: [PATCH] CBNet-EVA --- .../configs/common/models/cb_mask_rcnn_fpn.py | 95 +++ .../common/models/cb_mask_rcnn_vitdet.py | 87 +++ .../detectron2/modeling/backbone/__init__.py | 21 + .../detectron2/modeling/backbone/backbone.py | 74 ++ .../det/detectron2/modeling/backbone/build.py | 33 + .../detectron2/modeling/backbone/cb_vit.py | 254 +++++++ .../det/detectron2/modeling/backbone/fpn.py | 268 +++++++ .../det/detectron2/modeling/backbone/mvit.py | 446 +++++++++++ .../detectron2/modeling/backbone/regnet.py | 452 ++++++++++++ .../detectron2/modeling/backbone/resnet.py | 694 ++++++++++++++++++ .../det/detectron2/modeling/backbone/swin.py | 690 +++++++++++++++++ .../det/detectron2/modeling/backbone/utils.py | 492 +++++++++++++ .../det/detectron2/modeling/backbone/vit.py | 621 ++++++++++++++++ .../detectron2/modeling/meta_arch/__init__.py | 17 + .../detectron2/modeling/meta_arch/build.py | 25 + .../detectron2/modeling/meta_arch/cb_rcnn.py | 178 +++++ .../modeling/meta_arch/dense_detector.py | 289 ++++++++ .../det/detectron2/modeling/meta_arch/fcos.py | 328 +++++++++ .../modeling/meta_arch/panoptic_fpn.py | 269 +++++++ .../det/detectron2/modeling/meta_arch/rcnn.py | 389 ++++++++++ .../modeling/meta_arch/retinanet.py | 439 +++++++++++ .../modeling/meta_arch/semantic_seg.py | 267 +++++++ .../cascade_mask_rcnn_mvitv2_b_in21k_100ep.py | 95 +++ .../cascade_mask_rcnn_mvitv2_h_in21k_36ep.py | 39 + .../cascade_mask_rcnn_mvitv2_l_in21k_50ep.py | 22 + .../cascade_mask_rcnn_swin_b_in21k_50ep.py | 50 ++ .../cascade_mask_rcnn_swin_l_in21k_50ep.py | 15 + .../COCO/cascade_mask_rcnn_vitdet_1B_1536.py | 46 ++ .../COCO/cascade_mask_rcnn_vitdet_1B_75ep.py | 43 ++ .../COCO/cascade_mask_rcnn_vitdet_1B_attn.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_1280.py | 46 ++ .../cascade_mask_rcnn_vitdet_1B_attn_1408.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_1536.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_1664.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_1792.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_1920.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_2048.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_2176.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_2304.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_2432.py | 47 ++ .../cascade_mask_rcnn_vitdet_1B_attn_2560.py | 47 ++ .../COCO/cascade_mask_rcnn_vitdet_b_100ep.py | 48 ++ .../COCO/cascade_mask_rcnn_vitdet_b_ours.py | 51 ++ .../COCO/cascade_mask_rcnn_vitdet_h_75ep.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_1024.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_1280.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_1408.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_1536.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_1664.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_1792.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_1920.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_2048.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_2176.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_2304.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_2432.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_2560.py | 36 + .../cascade_mask_rcnn_vitdet_h_75ep_conv.py | 39 + .../COCO/cascade_mask_rcnn_vitdet_l_100ep.py | 23 + .../configs/COCO/mask_rcnn_vitdet_b_100ep.py | 38 + .../configs/COCO/mask_rcnn_vitdet_h_75ep.py | 31 + .../configs/COCO/mask_rcnn_vitdet_l_100ep.py | 23 + .../o365_cascade_mask_rcnn_vitdet_1B_attn.py | 70 ++ ...5_cascade_mask_rcnn_vitdet_1B_attn_1024.py 
| 70 ++ ...ade_mask_rcnn_vitdet_1B_attn_1024to1280.py | 70 ++ .../cascade_mask_rcnn_mvitv2_b_in21k_100ep.py | 48 ++ .../cascade_mask_rcnn_mvitv2_h_in21k_50ep.py | 25 + .../cascade_mask_rcnn_mvitv2_l_in21k_50ep.py | 24 + .../cascade_mask_rcnn_swin_b_in21k_50ep.py | 49 ++ .../cascade_mask_rcnn_swin_l_in21k_50ep.py | 12 + .../LVIS/cascade_mask_rcnn_vitdet_1B_attn.py | 84 +++ .../cascade_mask_rcnn_vitdet_1B_attn.py.bak | 84 +++ .../cascade_mask_rcnn_vitdet_1B_attn_1536.py | 84 +++ .../LVIS/cascade_mask_rcnn_vitdet_b_100ep.py | 51 ++ .../LVIS/cascade_mask_rcnn_vitdet_h_100ep.py | 51 ++ .../LVIS/cascade_mask_rcnn_vitdet_l_100ep.py | 51 ++ .../configs/LVIS/mask_rcnn_vitdet_b_100ep.py | 44 ++ .../configs/LVIS/mask_rcnn_vitdet_h_100ep.py | 28 + .../configs/LVIS/mask_rcnn_vitdet_l_100ep.py | 24 + .../Objects365/mask_rcnn_vitdet_b_100ep.py | 41 ++ .../ViTDet/configs/common/coco_loader_lsj.py | 22 + .../configs/common/coco_loader_lsj_1024.py | 22 + .../configs/common/coco_loader_lsj_1280.py | 22 + .../configs/common/coco_loader_lsj_1408.py | 22 + .../configs/common/coco_loader_lsj_1536.py | 22 + .../configs/common/coco_loader_lsj_1664.py | 22 + .../configs/common/coco_loader_lsj_1792.py | 22 + .../configs/common/coco_loader_lsj_1920.py | 22 + .../configs/common/coco_loader_lsj_2048.py | 22 + .../configs/common/coco_loader_lsj_2176.py | 22 + .../configs/common/coco_loader_lsj_2304.py | 22 + .../configs/common/coco_loader_lsj_2432.py | 22 + .../configs/common/coco_loader_lsj_2560.py | 22 + .../configs/common/objects365_loader_lsj.py | 22 + .../common/objects365_loader_lsj_1280.py | 22 + .../objects365_trainval_loader_lsj_1024.py | 22 + .../objects365_trainval_loader_lsj_1280.py | 22 + .../objects365_trainval_loader_lsj_1536.py | 22 + .../cascade_mask_rcnn_vitdet_b_100ep.py | 48 ++ .../cb_cascade_mask_rcnn_vitdet_b_100ep.py | 48 ++ .../cb_mask_rcnn_vitdet_b_100ep.py | 38 + ...k_rcnn_vitdet_b_6attn_win32_1536_lrd0p7.py | 45 ++ ...de_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py | 46 ++ ...k_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py | 46 ++ .../mask_rcnn_vitdet_b_100ep.py | 38 + .../cascade_mask_rcnn_vitdet_b_100ep.py | 48 ++ ...de_mask_rcnn_vitdet_b_4attn_1024_lrd0p7.py | 80 ++ ...de_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py | 79 ++ ...k_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py | 79 ++ .../mask_rcnn_vitdet_b_100ep.py | 38 + .../cascade_mask_rcnn_vitdet_b_100ep.py | 48 ++ ...de_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py | 62 ++ .../eva2_o365/mask_rcnn_vitdet_b_100ep.py | 38 + .../cascade_mask_rcnn_vitdet_b_100ep.py | 48 ++ .../cb_cascade_mask_rcnn_vitdet_b_100ep.py | 48 ++ ...de_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py | 91 +++ .../cb_mask_rcnn_vitdet_b_100ep.py | 38 + ...de_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py | 56 ++ .../mask_rcnn_vitdet_b_100ep.py | 38 + .../cascade_mask_rcnn_vitdet_b_100ep.py | 51 ++ ...de_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py | 71 ++ .../mask_rcnn_vitdet_b_100ep.py | 44 ++ EVA/README.md | 75 ++ EVA/get_cb_ckpt.py | 15 + README.md | 244 +++--- model_zoo.md | 8 +- 125 files changed, 10815 insertions(+), 121 deletions(-) create mode 100644 EVA/EVA-02/det/configs/common/models/cb_mask_rcnn_fpn.py create mode 100644 EVA/EVA-02/det/configs/common/models/cb_mask_rcnn_vitdet.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/__init__.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/backbone.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/build.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/cb_vit.py create mode 100644 
EVA/EVA-02/det/detectron2/modeling/backbone/fpn.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/mvit.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/regnet.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/resnet.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/swin.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/utils.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/backbone/vit.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/meta_arch/__init__.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/meta_arch/build.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/meta_arch/cb_rcnn.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/meta_arch/dense_detector.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/meta_arch/fcos.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/meta_arch/panoptic_fpn.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/meta_arch/rcnn.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/meta_arch/retinanet.py create mode 100644 EVA/EVA-02/det/detectron2/modeling/meta_arch/semantic_seg.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_h_in21k_36ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_swin_b_in21k_50ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_swin_l_in21k_50ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_1536.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_75ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1280.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1408.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1536.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1664.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1792.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1920.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2048.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2176.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2304.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2432.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2560.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_b_ours.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1024.py create mode 100644 
EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1280.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1408.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1536.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1664.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1792.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1920.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2048.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2176.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2304.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2432.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2560.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_conv.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_l_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_h_75ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_l_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn_1024.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn_1024to1280.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_h_in21k_50ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_swin_b_in21k_50ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_swin_l_in21k_50ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn.py.bak create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn_1536.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_h_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_l_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_h_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_l_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/Objects365/mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj.py create mode 100644 
EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1024.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1280.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1408.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1536.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1664.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1792.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1920.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2048.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2176.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2304.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2432.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2560.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_loader_lsj.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_loader_lsj_1280.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1024.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1280.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1536.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cascade_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cb_cascade_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cb_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_b_6attn_win32_1536_lrd0p7.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/cascade_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_b_4attn_1024_lrd0p7.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/cascade_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/eva2_o365_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cascade_mask_rcnn_vitdet_b_100ep.py create mode 100644 
EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_cascade_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/cascade_mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/eva2_o365_to_lvis_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py create mode 100644 EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/mask_rcnn_vitdet_b_100ep.py create mode 100644 EVA/README.md create mode 100644 EVA/get_cb_ckpt.py mode change 100755 => 100644 README.md diff --git a/EVA/EVA-02/det/configs/common/models/cb_mask_rcnn_fpn.py b/EVA/EVA-02/det/configs/common/models/cb_mask_rcnn_fpn.py new file mode 100644 index 00000000..ac1e200e --- /dev/null +++ b/EVA/EVA-02/det/configs/common/models/cb_mask_rcnn_fpn.py @@ -0,0 +1,95 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.modeling.meta_arch import CBGeneralizedRCNN +from detectron2.modeling.anchor_generator import DefaultAnchorGenerator +from detectron2.modeling.backbone.fpn import LastLevelMaxPool +from detectron2.modeling.backbone import BasicStem, FPN, ResNet +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.poolers import ROIPooler +from detectron2.modeling.proposal_generator import RPN, StandardRPNHead +from detectron2.modeling.roi_heads import ( + StandardROIHeads, + FastRCNNOutputLayers, + MaskRCNNConvUpsampleHead, + FastRCNNConvFCHead, +) + +from ..data.constants import constants + +model = L(CBGeneralizedRCNN)( + backbone=L(FPN)( + bottom_up=L(ResNet)( + stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), + stages=L(ResNet.make_default_stages)( + depth=50, + stride_in_1x1=True, + norm="FrozenBN", + ), + out_features=["res2", "res3", "res4", "res5"], + ), + in_features="${.bottom_up.out_features}", + out_channels=256, + top_block=L(LastLevelMaxPool)(), + ), + proposal_generator=L(RPN)( + in_features=["p2", "p3", "p4", "p5", "p6"], + head=L(StandardRPNHead)(in_channels=256, num_anchors=3), + anchor_generator=L(DefaultAnchorGenerator)( + sizes=[[32], [64], [128], [256], [512]], + aspect_ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64], + offset=0.0, + ), + anchor_matcher=L(Matcher)( + thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True + ), + box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]), + batch_size_per_image=256, + positive_fraction=0.5, + pre_nms_topk=(2000, 1000), + post_nms_topk=(1000, 1000), + nms_thresh=0.7, + ), + roi_heads=L(StandardROIHeads)( + num_classes=80, + batch_size_per_image=512, + positive_fraction=0.25, + proposal_matcher=L(Matcher)( + thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False + ), + box_in_features=["p2", "p3", "p4", "p5"], + box_pooler=L(ROIPooler)( + output_size=7, + scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), + sampling_ratio=0, + pooler_type="ROIAlignV2", + ), + box_head=L(FastRCNNConvFCHead)( 
+ input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[], + fc_dims=[1024, 1024], + ), + box_predictor=L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)), + num_classes="${..num_classes}", + ), + mask_in_features=["p2", "p3", "p4", "p5"], + mask_pooler=L(ROIPooler)( + output_size=14, + scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), + sampling_ratio=0, + pooler_type="ROIAlignV2", + ), + mask_head=L(MaskRCNNConvUpsampleHead)( + input_shape=ShapeSpec(channels=256, width=14, height=14), + num_classes="${..num_classes}", + conv_dims=[256, 256, 256, 256, 256], + ), + ), + pixel_mean=constants.imagenet_bgr256_mean, + pixel_std=constants.imagenet_bgr256_std, + input_format="BGR", +) diff --git a/EVA/EVA-02/det/configs/common/models/cb_mask_rcnn_vitdet.py b/EVA/EVA-02/det/configs/common/models/cb_mask_rcnn_vitdet.py new file mode 100644 index 00000000..f546936f --- /dev/null +++ b/EVA/EVA-02/det/configs/common/models/cb_mask_rcnn_vitdet.py @@ -0,0 +1,87 @@ +from functools import partial +import torch.nn as nn +from detectron2.config import LazyCall as L +from detectron2.modeling import CBViT, CBSimpleFeaturePyramid +from detectron2.modeling.backbone.fpn import LastLevelMaxPool + +from .cb_mask_rcnn_fpn import model +from ..data.constants import constants + +model.pixel_mean = constants.imagenet_rgb256_mean +model.pixel_std = constants.imagenet_rgb256_std +model.input_format = "RGB" + +# from apex.normalization import FusedLayerNorm + +# Base +embed_dim, depth, num_heads, dp = 768, 12, 12, 0.1 +# Creates Simple Feature Pyramid from ViT backbone +model.backbone = L(CBSimpleFeaturePyramid)( + net=L(CBViT)( # Single-scale ViT backbone + img_size=1024, + patch_size=16, + embed_dim=embed_dim, + depth=depth, + num_heads=num_heads, + drop_path_rate=dp, + window_size=14, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + window_block_indexes=[ + # 2, 5, 8 11 for global attention + 0, + 1, + 3, + 4, + 6, + 7, + 9, + 10, + ], + residual_block_indexes=[], + use_rel_pos=True, + out_feature="last_feat", + ), + cb_net=L(CBViT)( # Single-scale ViT backbone + img_size=1024, + patch_size=16, + embed_dim=embed_dim, + depth=depth, + num_heads=num_heads, + drop_path_rate=dp, + window_size=14, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + window_block_indexes=[ + # 2, 5, 8 11 for global attention + 0, + 1, + 3, + 4, + 6, + 7, + 9, + 10, + ], + residual_block_indexes=[], + use_rel_pos=True, + out_feature="last_feat", + ), + in_feature="${.net.out_feature}", + out_channels=256, + scale_factors=(4.0, 2.0, 1.0, 0.5), + top_block=L(LastLevelMaxPool)(), + norm="LN", + square_pad=1024, +) + +model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN" + +# 2conv in RPN: +model.proposal_generator.head.conv_dims = [-1, -1] + +# 4conv1fc box head +model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] +model.roi_heads.box_head.fc_dims = [1024] diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/__init__.py b/EVA/EVA-02/det/detectron2/modeling/backbone/__init__.py new file mode 100644 index 00000000..15924273 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
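# A minimal usage sketch for the composite-backbone config above (illustrative
# only; the import path and index values below are assumptions, not part of
# this patch). Because CBViT takes a required `cb_out_index` argument and the
# base config does not set it, a downstream LazyConfig presumably fills it in
# before instantiation, e.g.:
#
#     from ..common.models.cb_mask_rcnn_vitdet import model  # assumed path
#
#     # The lead branch ("net") is called without cb_feats, so it presumably
#     # keeps its patch/pos embeddings (del_patch_embed defaults to True and
#     # would otherwise delete them); the assisting branch ("cb_net") receives
#     # fused features instead and can keep the default.
#     model.backbone.net.del_patch_embed = False
#     # Hypothetical taps at the four global-attention blocks; note the last
#     # index must equal depth - 1 (11 for the Base model).
#     model.backbone.net.cb_out_index = [2, 5, 8, 11]
#     model.backbone.cb_net.cb_out_index = [2, 5, 8, 11]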
+from .build import build_backbone, BACKBONE_REGISTRY # noqa F401 isort:skip + +from .backbone import Backbone +from .fpn import FPN +from .regnet import RegNet +from .resnet import ( + BasicStem, + ResNet, + ResNetBlockBase, + build_resnet_backbone, + make_stage, + BottleneckBlock, +) +from .vit import ViT, SimpleFeaturePyramid, get_vit_lr_decay_rate +from .mvit import MViT +from .swin import SwinTransformer +from .cb_vit import CBViT, CBSimpleFeaturePyramid + +__all__ = [k for k in globals().keys() if not k.startswith("_")] +# TODO can expose more resnet blocks after careful consideration diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/backbone.py b/EVA/EVA-02/det/detectron2/modeling/backbone/backbone.py new file mode 100644 index 00000000..e1c765a6 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/backbone.py @@ -0,0 +1,74 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from abc import ABCMeta, abstractmethod +from typing import Dict +import torch.nn as nn + +from detectron2.layers import ShapeSpec + +__all__ = ["Backbone"] + + +class Backbone(nn.Module, metaclass=ABCMeta): + """ + Abstract base class for network backbones. + """ + + def __init__(self): + """ + The `__init__` method of any subclass can specify its own set of arguments. + """ + super().__init__() + + @abstractmethod + def forward(self): + """ + Subclasses must override this method, but adhere to the same return type. + + Returns: + dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor + """ + pass + + @property + def size_divisibility(self) -> int: + """ + Some backbones require the input height and width to be divisible by a + specific integer. This is typically true for encoder / decoder type networks + with lateral connection (e.g., FPN) for which feature maps need to match + dimension in the "bottom up" and "top down" paths. Set to 0 if no specific + input size divisibility is required. + """ + return 0 + + @property + def padding_constraints(self) -> Dict[str, int]: + """ + This property is a generalization of size_divisibility. Some backbones and training + recipes require specific padding constraints, such as enforcing divisibility by a specific + integer (e.g., FPN) or padding to a square (e.g., ViTDet with large-scale jitter + in :paper:vitdet). `padding_constraints` contains these optional items like: + { + "size_divisibility": int, + "square_size": int, + # Future options are possible + } + `size_divisibility` will read from here if presented and `square_size` indicates the + square padding size if `square_size` > 0. + + TODO: use type of Dict[str, int] to avoid torchscipt issues. The type of padding_constraints + could be generalized as TypedDict (Python 3.8+) to support more types in the future. + """ + return {} + + def output_shape(self): + """ + Returns: + dict[str->ShapeSpec] + """ + # this is a backward-compatible default + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/build.py b/EVA/EVA-02/det/detectron2/modeling/backbone/build.py new file mode 100644 index 00000000..af021411 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/build.py @@ -0,0 +1,33 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
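# A minimal sketch of the `Backbone` contract documented above (illustrative
# only; the class and feature names here are made up). `forward` must return a
# dict of named feature maps, and the default `output_shape` reads the
# `_out_feature_channels` / `_out_feature_strides` dicts:
#
#     import torch.nn as nn
#     from .backbone import Backbone
#
#     class ToyBackbone(Backbone):
#         def __init__(self):
#             super().__init__()
#             # single stride-4 conv standing in for a real feature extractor
#             self.conv = nn.Conv2d(3, 64, kernel_size=3, stride=4, padding=1)
#             self._out_features = ["toy4"]
#             self._out_feature_channels = {"toy4": 64}
#             self._out_feature_strides = {"toy4": 4}
#
#         def forward(self, x):
#             return {"toy4": self.conv(x)}  # feature name -> NCHW tensor
#
#         @property
#         def size_divisibility(self):
#             return 4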
+from detectron2.layers import ShapeSpec +from detectron2.utils.registry import Registry + +from .backbone import Backbone + +BACKBONE_REGISTRY = Registry("BACKBONE") +BACKBONE_REGISTRY.__doc__ = """ +Registry for backbones, which extract feature maps from images + +The registered object must be a callable that accepts two arguments: + +1. A :class:`detectron2.config.CfgNode` +2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification. + +Registered object must return instance of :class:`Backbone`. +""" + + +def build_backbone(cfg, input_shape=None): + """ + Build a backbone from `cfg.MODEL.BACKBONE.NAME`. + + Returns: + an instance of :class:`Backbone` + """ + if input_shape is None: + input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) + + backbone_name = cfg.MODEL.BACKBONE.NAME + backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape) + assert isinstance(backbone, Backbone) + return backbone diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/cb_vit.py b/EVA/EVA-02/det/detectron2/modeling/backbone/cb_vit.py new file mode 100644 index 00000000..4a969902 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/cb_vit.py @@ -0,0 +1,254 @@ +import logging +import math +from functools import partial + +import fvcore.nn.weight_init as weight_init +import torch +import torch.nn as nn +import torch.nn.functional as F + +from detectron2.layers import CNNBlockBase, Conv2d, get_norm +from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous + +from .backbone import Backbone +from .utils import ( + PatchEmbed, + add_decomposed_rel_pos, + get_abs_pos, + window_partition, + window_unpartition, + VisionRotaryEmbeddingFast, +) + +try: + import xformers.ops as xops +except: + pass + +try: + from apex.normalization import FusedLayerNorm +except: + pass + + +logger = logging.getLogger(__name__) +from mmcv.cnn import constant_init + +from .vit import ViT, SimpleFeaturePyramid +__all__ = ["CBViT", "CBSimpleFeaturePyramid"] + + + +class CBViT(ViT): + """ + This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`. + "Exploring Plain Vision Transformer Backbones for Object Detection", + https://arxiv.org/abs/2203.16527 + """ + + def __init__( + self, + cb_out_index, + del_patch_embed=True, + **kwargs + ): + super().__init__(**kwargs) + self.cb_out_index = cb_out_index + if del_patch_embed: + del self.patch_embed + del self.pos_embed + self.patch_embed = None + self.pos_embed = None + + assert cb_out_index[-1]+1 == len(self.blocks) + + def forward(self, x, cb_feats=None): + assert cb_feats is not None or self.patch_embed is not None + if self.patch_embed is None: + x = cb_feats[0] + cb_feats[1] + # print(len(cb_feats)) + else: + x = self.patch_embed(x) + if self.pos_embed is not None: + x = x + get_abs_pos( + self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2]) + ) + + cb_output = [x] + j = 2 + # print(len(cb_feats)) + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in self.cb_out_index: + cb_output.append(x) + if cb_feats is not None and i != self.cb_out_index[-1]: + x = x + cb_feats[j] + j += 1 + # print(i, j) + # assert j == len(self.cb_out_index) + 1 + # print(j, len(self.cb_out_index)) + + outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)} + + return outputs, cb_output + + +class CBSimpleFeaturePyramid(SimpleFeaturePyramid): + """ + This module implements SimpleFeaturePyramid in :paper:`vitdet`. + It creates pyramid features built on top of the input feature map. 
+ """ + + def __init__( + self, + net, + cb_net, + in_feature, + out_channels, + scale_factors, + top_block=None, + norm="LN", + square_pad=0, + ): + super(SimpleFeaturePyramid, self).__init__() + assert isinstance(net, Backbone) + + self.scale_factors = scale_factors + + input_shapes = net.output_shape() + strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors] + _assert_strides_are_log2_contiguous(strides) + + dim = input_shapes[in_feature].channels + self.stages = [] + use_bias = norm == "" + for idx, scale in enumerate(scale_factors): + out_dim = dim + if scale == 4.0: + layers = [ + nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2), + get_norm(norm, dim // 2), + nn.GELU(), + nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2), + ] + out_dim = dim // 4 + elif scale == 2.0: + layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)] + out_dim = dim // 2 + elif scale == 1.0: + layers = [] + elif scale == 0.5: + layers = [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + raise NotImplementedError(f"scale_factor={scale} is not supported yet.") + + layers.extend( + [ + Conv2d( + out_dim, + out_channels, + kernel_size=1, + bias=use_bias, + norm=get_norm(norm, out_channels), + ), + Conv2d( + out_channels, + out_channels, + kernel_size=3, + padding=1, + bias=use_bias, + norm=get_norm(norm, out_channels), + ), + ] + ) + layers = nn.Sequential(*layers) + + stage = int(math.log2(strides[idx])) + self.add_module(f"simfp_{stage}", layers) + self.stages.append(layers) + + self.net = net + self.cb_net = cb_net + + self.in_feature = in_feature + self.top_block = top_block + # Return feature names are "p", like ["p2", "p3", ..., "p6"] + self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} + # top block output feature maps. + if self.top_block is not None: + for s in range(stage, stage + self.top_block.num_levels): + self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) + + self._out_features = list(self._out_feature_strides.keys()) + self._out_feature_channels = {k: out_channels for k in self._out_features} + self._size_divisibility = strides[-1] + self._square_pad = square_pad + + self.cb_linears = nn.ModuleList() + for i in range(4): + linears = nn.ModuleList() + jrange = 4 - i + for j in range(jrange): + linears.append(nn.Conv2d(net.embed_dim, net.embed_dim, 1)) + self.cb_linears.append(linears) + + self.init_cb_weights() + + def init_cb_weights(self): + for ls in self.cb_linears: + for m in ls: + # if isinstance(m, nn.Sequential): + # constant_init(m[-1], 0) + # # torch.nn.init.constant(m, val) + # else: + constant_init(m, 0) + + def forward_neck(self, bottom_up_features): + features = bottom_up_features[self.in_feature] + results = [] + + for stage in self.stages: + results.append(stage(features)) + + if self.top_block is not None: + if self.top_block.in_feature in bottom_up_features: + top_block_in_feature = bottom_up_features[self.top_block.in_feature] + else: + top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] + results.extend(self.top_block(top_block_in_feature)) + assert len(self._out_features) == len(results) + return {f: res for f, res in zip(self._out_features, results)} + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + + Returns: + dict[str->Tensor]: + mapping from feature map name to pyramid feature map tensor + in high to low resolution order. 
Returned feature names follow the FPN + convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. + """ + bottom_up_features, cb_output = self.net(x) + results = self.forward_neck(bottom_up_features) + + cb_feats = [cb_output[0]] + for i in range(4): + feats = 0 + jrange = 4 - i + for j in range(jrange): + feats += self.cb_linears[i][j](cb_output[i+1].permute(0, 3, 1, 2)) + cb_feats.append(feats.permute(0, 2, 3, 1)) + + cb_bottom_up_features, _ = self.cb_net(x, cb_feats) + cb_results = self.forward_neck(cb_bottom_up_features) + + return [results, cb_results] + + + + + + diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/fpn.py b/EVA/EVA-02/det/detectron2/modeling/backbone/fpn.py new file mode 100644 index 00000000..19d24e13 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/fpn.py @@ -0,0 +1,268 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import math +import fvcore.nn.weight_init as weight_init +import torch +import torch.nn.functional as F +from torch import nn + +from detectron2.layers import Conv2d, ShapeSpec, get_norm + +from .backbone import Backbone +from .build import BACKBONE_REGISTRY +from .resnet import build_resnet_backbone + +__all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"] + + +class FPN(Backbone): + """ + This module implements :paper:`FPN`. + It creates pyramid features built on top of some input feature maps. + """ + + _fuse_type: torch.jit.Final[str] + + def __init__( + self, + bottom_up, + in_features, + out_channels, + norm="", + top_block=None, + fuse_type="sum", + square_pad=0, + ): + """ + Args: + bottom_up (Backbone): module representing the bottom up subnetwork. + Must be a subclass of :class:`Backbone`. The multi-scale feature + maps generated by the bottom up network, and listed in `in_features`, + are used to generate FPN levels. + in_features (list[str]): names of the input feature maps coming + from the backbone to which FPN is attached. For example, if the + backbone produces ["res2", "res3", "res4"], any *contiguous* sublist + of these may be used; order must be from high to low resolution. + out_channels (int): number of channels in the output feature maps. + norm (str): the normalization to use. + top_block (nn.Module or None): if provided, an extra operation will + be performed on the output of the last (smallest resolution) + FPN output, and the result will extend the result list. The top_block + further downsamples the feature map. It must have an attribute + "num_levels", meaning the number of extra FPN levels added by + this block, and "in_feature", which is a string representing + its input feature (e.g., p5). + fuse_type (str): types for fusing the top down features and the lateral + ones. It can be "sum" (default), which sums up element-wise; or "avg", + which takes the element-wise mean of the two. + square_pad (int): If > 0, require input images to be padded to specific square size. + """ + super(FPN, self).__init__() + assert isinstance(bottom_up, Backbone) + assert in_features, in_features + + # Feature map strides and channels from the bottom up network (e.g. 
ResNet) + input_shapes = bottom_up.output_shape() + strides = [input_shapes[f].stride for f in in_features] + in_channels_per_feature = [input_shapes[f].channels for f in in_features] + + _assert_strides_are_log2_contiguous(strides) + lateral_convs = [] + output_convs = [] + + use_bias = norm == "" + for idx, in_channels in enumerate(in_channels_per_feature): + lateral_norm = get_norm(norm, out_channels) + output_norm = get_norm(norm, out_channels) + + lateral_conv = Conv2d( + in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm + ) + output_conv = Conv2d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + ) + weight_init.c2_xavier_fill(lateral_conv) + weight_init.c2_xavier_fill(output_conv) + stage = int(math.log2(strides[idx])) + self.add_module("fpn_lateral{}".format(stage), lateral_conv) + self.add_module("fpn_output{}".format(stage), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs into top-down order (from low to high resolution) + # to make the top-down computation in forward clearer. + self.lateral_convs = lateral_convs[::-1] + self.output_convs = output_convs[::-1] + self.top_block = top_block + self.in_features = tuple(in_features) + self.bottom_up = bottom_up + # Return feature names are "p", like ["p2", "p3", ..., "p6"] + self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} + # top block output feature maps. + if self.top_block is not None: + for s in range(stage, stage + self.top_block.num_levels): + self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) + + self._out_features = list(self._out_feature_strides.keys()) + self._out_feature_channels = {k: out_channels for k in self._out_features} + self._size_divisibility = strides[-1] + self._square_pad = square_pad + assert fuse_type in {"avg", "sum"} + self._fuse_type = fuse_type + + @property + def size_divisibility(self): + return self._size_divisibility + + @property + def padding_constraints(self): + return {"square_size": self._square_pad} + + def forward(self, x): + """ + Args: + input (dict[str->Tensor]): mapping feature map name (e.g., "res5") to + feature map tensor for each feature level in high to low resolution order. + + Returns: + dict[str->Tensor]: + mapping from feature map name to FPN feature map tensor + in high to low resolution order. Returned feature names follow the FPN + paper convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. 
+ """ + bottom_up_features = self.bottom_up(x) + results = [] + prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]]) + results.append(self.output_convs[0](prev_features)) + + # Reverse feature maps into top-down order (from low to high resolution) + for idx, (lateral_conv, output_conv) in enumerate( + zip(self.lateral_convs, self.output_convs) + ): + # Slicing of ModuleList is not supported https://github.com/pytorch/pytorch/issues/47336 + # Therefore we loop over all modules but skip the first one + if idx > 0: + features = self.in_features[-idx - 1] + features = bottom_up_features[features] + top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest") + lateral_features = lateral_conv(features) + prev_features = lateral_features + top_down_features + if self._fuse_type == "avg": + prev_features /= 2 + results.insert(0, output_conv(prev_features)) + + if self.top_block is not None: + if self.top_block.in_feature in bottom_up_features: + top_block_in_feature = bottom_up_features[self.top_block.in_feature] + else: + top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] + results.extend(self.top_block(top_block_in_feature)) + assert len(self._out_features) == len(results) + return {f: res for f, res in zip(self._out_features, results)} + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } + + +def _assert_strides_are_log2_contiguous(strides): + """ + Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2". + """ + for i, stride in enumerate(strides[1:], 1): + assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format( + stride, strides[i - 1] + ) + + +class LastLevelMaxPool(nn.Module): + """ + This module is used in the original FPN to generate a downsampled + P6 feature from P5. + """ + + def __init__(self): + super().__init__() + self.num_levels = 1 + self.in_feature = "p5" + + def forward(self, x): + return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)] + + +class LastLevelP6P7(nn.Module): + """ + This module is used in RetinaNet to generate extra layers, P6 and P7 from + C5 feature. + """ + + def __init__(self, in_channels, out_channels, in_feature="res5"): + super().__init__() + self.num_levels = 2 + self.in_feature = in_feature + self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) + self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) + for module in [self.p6, self.p7]: + weight_init.c2_xavier_fill(module) + + def forward(self, c5): + p6 = self.p6(c5) + p7 = self.p7(F.relu(p6)) + return [p6, p7] + + +@BACKBONE_REGISTRY.register() +def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): + """ + Args: + cfg: a detectron2 CfgNode + + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. + """ + bottom_up = build_resnet_backbone(cfg, input_shape) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelMaxPool(), + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone + + +@BACKBONE_REGISTRY.register() +def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): + """ + Args: + cfg: a detectron2 CfgNode + + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 
+ """ + bottom_up = build_resnet_backbone(cfg, input_shape) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + in_channels_p6p7 = bottom_up.output_shape()["res5"].channels + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelP6P7(in_channels_p6p7, out_channels), + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/mvit.py b/EVA/EVA-02/det/detectron2/modeling/backbone/mvit.py new file mode 100644 index 00000000..d3477b8b --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/mvit.py @@ -0,0 +1,446 @@ +import logging +import numpy as np +import torch +import torch.nn as nn + +from fairscale.nn.checkpoint import checkpoint_wrapper +from timm.models.layers import DropPath, Mlp, trunc_normal_ + +from .backbone import Backbone +from .utils import ( + PatchEmbed, + add_decomposed_rel_pos, + get_abs_pos, + window_partition, + window_unpartition, +) + +logger = logging.getLogger(__name__) + + +__all__ = ["MViT"] + + +def attention_pool(x, pool, norm=None): + # (B, H, W, C) -> (B, C, H, W) + x = x.permute(0, 3, 1, 2) + x = pool(x) + # (B, C, H1, W1) -> (B, H1, W1, C) + x = x.permute(0, 2, 3, 1) + if norm: + x = norm(x) + + return x + + +class MultiScaleAttention(nn.Module): + """Multiscale Multi-head Attention block.""" + + def __init__( + self, + dim, + dim_out, + num_heads, + qkv_bias=True, + norm_layer=nn.LayerNorm, + pool_kernel=(3, 3), + stride_q=1, + stride_kv=1, + residual_pooling=True, + window_size=0, + use_rel_pos=False, + rel_pos_zero_init=True, + input_size=None, + ): + """ + Args: + dim (int): Number of input channels. + dim_out (int): Number of output channels. + num_heads (int): Number of attention heads. + qkv_bias (bool: If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + pool_kernel (tuple): kernel size for qkv pooling layers. + stride_q (int): stride size for q pooling layer. + stride_kv (int): stride size for kv pooling layer. + residual_pooling (bool): If true, enable residual pooling. + use_rel_pos (bool): If True, add relative postional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + input_size (int or None): Input resolution. 
+ """ + super().__init__() + self.num_heads = num_heads + head_dim = dim_out // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim_out * 3, bias=qkv_bias) + self.proj = nn.Linear(dim_out, dim_out) + + # qkv pooling + pool_padding = [k // 2 for k in pool_kernel] + dim_conv = dim_out // num_heads + self.pool_q = nn.Conv2d( + dim_conv, + dim_conv, + pool_kernel, + stride=stride_q, + padding=pool_padding, + groups=dim_conv, + bias=False, + ) + self.norm_q = norm_layer(dim_conv) + self.pool_k = nn.Conv2d( + dim_conv, + dim_conv, + pool_kernel, + stride=stride_kv, + padding=pool_padding, + groups=dim_conv, + bias=False, + ) + self.norm_k = norm_layer(dim_conv) + self.pool_v = nn.Conv2d( + dim_conv, + dim_conv, + pool_kernel, + stride=stride_kv, + padding=pool_padding, + groups=dim_conv, + bias=False, + ) + self.norm_v = norm_layer(dim_conv) + + self.window_size = window_size + if window_size: + self.q_win_size = window_size // stride_q + self.kv_win_size = window_size // stride_kv + self.residual_pooling = residual_pooling + + self.use_rel_pos = use_rel_pos + if self.use_rel_pos: + # initialize relative positional embeddings + assert input_size[0] == input_size[1] + size = input_size[0] + rel_dim = 2 * max(size // stride_q, size // stride_kv) - 1 + self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim)) + + if not rel_pos_zero_init: + trunc_normal_(self.rel_pos_h, std=0.02) + trunc_normal_(self.rel_pos_w, std=0.02) + + def forward(self, x): + B, H, W, _ = x.shape + # qkv with shape (3, B, nHead, H, W, C) + qkv = self.qkv(x).reshape(B, H, W, 3, self.num_heads, -1).permute(3, 0, 4, 1, 2, 5) + # q, k, v with shape (B * nHead, H, W, C) + q, k, v = qkv.reshape(3, B * self.num_heads, H, W, -1).unbind(0) + + q = attention_pool(q, self.pool_q, self.norm_q) + k = attention_pool(k, self.pool_k, self.norm_k) + v = attention_pool(v, self.pool_v, self.norm_v) + + ori_q = q + if self.window_size: + q, q_hw_pad = window_partition(q, self.q_win_size) + k, kv_hw_pad = window_partition(k, self.kv_win_size) + v, _ = window_partition(v, self.kv_win_size) + q_hw = (self.q_win_size, self.q_win_size) + kv_hw = (self.kv_win_size, self.kv_win_size) + else: + q_hw = q.shape[1:3] + kv_hw = k.shape[1:3] + + q = q.view(q.shape[0], np.prod(q_hw), -1) + k = k.view(k.shape[0], np.prod(kv_hw), -1) + v = v.view(v.shape[0], np.prod(kv_hw), -1) + + attn = (q * self.scale) @ k.transpose(-2, -1) + + if self.use_rel_pos: + attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, q_hw, kv_hw) + + attn = attn.softmax(dim=-1) + x = attn @ v + + x = x.view(x.shape[0], q_hw[0], q_hw[1], -1) + + if self.window_size: + x = window_unpartition(x, self.q_win_size, q_hw_pad, ori_q.shape[1:3]) + + if self.residual_pooling: + x += ori_q + + H, W = x.shape[1], x.shape[2] + x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) + x = self.proj(x) + + return x + + +class MultiScaleBlock(nn.Module): + """Multiscale Transformer blocks""" + + def __init__( + self, + dim, + dim_out, + num_heads, + mlp_ratio=4.0, + qkv_bias=True, + drop_path=0.0, + norm_layer=nn.LayerNorm, + act_layer=nn.GELU, + qkv_pool_kernel=(3, 3), + stride_q=1, + stride_kv=1, + residual_pooling=True, + window_size=0, + use_rel_pos=False, + rel_pos_zero_init=True, + input_size=None, + ): + """ + Args: + dim (int): Number of input channels. + dim_out (int): Number of output channels. + num_heads (int): Number of attention heads in the MViT block. 
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + drop_path (float): Stochastic depth rate. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + qkv_pool_kernel (tuple): kernel size for qkv pooling layers. + stride_q (int): stride size for q pooling layer. + stride_kv (int): stride size for kv pooling layer. + residual_pooling (bool): If true, enable residual pooling. + window_size (int): Window size for window attention blocks. If it equals 0, then not + use window attention. + use_rel_pos (bool): If True, add relative postional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + input_size (int or None): Input resolution. + """ + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = MultiScaleAttention( + dim, + dim_out, + num_heads=num_heads, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + pool_kernel=qkv_pool_kernel, + stride_q=stride_q, + stride_kv=stride_kv, + residual_pooling=residual_pooling, + window_size=window_size, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim_out) + self.mlp = Mlp( + in_features=dim_out, + hidden_features=int(dim_out * mlp_ratio), + out_features=dim_out, + act_layer=act_layer, + ) + + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + if stride_q > 1: + kernel_skip = stride_q + 1 + padding_skip = int(kernel_skip // 2) + self.pool_skip = nn.MaxPool2d(kernel_skip, stride_q, padding_skip, ceil_mode=False) + + def forward(self, x): + x_norm = self.norm1(x) + x_block = self.attn(x_norm) + + if hasattr(self, "proj"): + x = self.proj(x_norm) + if hasattr(self, "pool_skip"): + x = attention_pool(x, self.pool_skip) + + x = x + self.drop_path(x_block) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class MViT(Backbone): + """ + This module implements Multiscale Vision Transformer (MViT) backbone in :paper:'mvitv2'. + """ + + def __init__( + self, + img_size=224, + patch_kernel=(7, 7), + patch_stride=(4, 4), + patch_padding=(3, 3), + in_chans=3, + embed_dim=96, + depth=16, + num_heads=1, + last_block_indexes=(0, 2, 11, 15), + qkv_pool_kernel=(3, 3), + adaptive_kv_stride=4, + adaptive_window_size=56, + residual_pooling=True, + mlp_ratio=4.0, + qkv_bias=True, + drop_path_rate=0.0, + norm_layer=nn.LayerNorm, + act_layer=nn.GELU, + use_abs_pos=False, + use_rel_pos=True, + rel_pos_zero_init=True, + use_act_checkpoint=False, + pretrain_img_size=224, + pretrain_use_cls_token=True, + out_features=("scale2", "scale3", "scale4", "scale5"), + ): + """ + Args: + img_size (int): Input image size. + patch_kernel (tuple): kernel size for patch embedding. + patch_stride (tuple): stride size for patch embedding. + patch_padding (tuple): padding size for patch embedding. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + depth (int): Depth of MViT. + num_heads (int): Number of base attention heads in each MViT block. + last_block_indexes (tuple): Block indexes for last blocks in each stage. + qkv_pool_kernel (tuple): kernel size for qkv pooling layers. + adaptive_kv_stride (int): adaptive stride size for kv pooling. + adaptive_window_size (int): adaptive window size for window attention blocks. + residual_pooling (bool): If true, enable residual pooling. 
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + drop_path_rate (float): Stochastic depth rate. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_abs_pos (bool): If True, use absolute positional embeddings. + use_rel_pos (bool): If True, add relative postional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. + use_act_checkpoint (bool): If True, use activation checkpointing. + pretrain_img_size (int): input image size for pretraining models. + pretrain_use_cls_token (bool): If True, pretrainig models use class token. + out_features (tuple): name of the feature maps from each stage. + """ + super().__init__() + self.pretrain_use_cls_token = pretrain_use_cls_token + + self.patch_embed = PatchEmbed( + kernel_size=patch_kernel, + stride=patch_stride, + padding=patch_padding, + in_chans=in_chans, + embed_dim=embed_dim, + ) + + if use_abs_pos: + # Initialize absoluate positional embedding with pretrain image size. + num_patches = (pretrain_img_size // patch_stride[0]) * ( + pretrain_img_size // patch_stride[1] + ) + num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches + self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim)) + else: + self.pos_embed = None + + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + dim_out = embed_dim + stride_kv = adaptive_kv_stride + window_size = adaptive_window_size + input_size = (img_size // patch_stride[0], img_size // patch_stride[1]) + stage = 2 + stride = patch_stride[0] + self._out_feature_strides = {} + self._out_feature_channels = {} + self.blocks = nn.ModuleList() + for i in range(depth): + # Multiply stride_kv by 2 if it's the last block of stage2 and stage3. + if i == last_block_indexes[1] or i == last_block_indexes[2]: + stride_kv_ = stride_kv * 2 + else: + stride_kv_ = stride_kv + # hybrid window attention: global attention in last three stages. 
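# (window_size_ == 0 disables window partitioning, so only the final block of
# each of the last three stages attends globally; every other block uses
# window attention with the current adaptive window size.)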
+ window_size_ = 0 if i in last_block_indexes[1:] else window_size + block = MultiScaleBlock( + dim=embed_dim, + dim_out=dim_out, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + qkv_pool_kernel=qkv_pool_kernel, + stride_q=2 if i - 1 in last_block_indexes else 1, + stride_kv=stride_kv_, + residual_pooling=residual_pooling, + window_size=window_size_, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size, + ) + if use_act_checkpoint: + block = checkpoint_wrapper(block) + self.blocks.append(block) + + embed_dim = dim_out + if i in last_block_indexes: + name = f"scale{stage}" + if name in out_features: + self._out_feature_channels[name] = dim_out + self._out_feature_strides[name] = stride + self.add_module(f"{name}_norm", norm_layer(dim_out)) + + dim_out *= 2 + num_heads *= 2 + stride_kv = max(stride_kv // 2, 1) + stride *= 2 + stage += 1 + if i - 1 in last_block_indexes: + window_size = window_size // 2 + input_size = [s // 2 for s in input_size] + + self._out_features = out_features + self._last_block_indexes = last_block_indexes + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=0.02) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x): + x = self.patch_embed(x) + + if self.pos_embed is not None: + x = x + get_abs_pos(self.pos_embed, self.pretrain_use_cls_token, x.shape[1:3]) + + outputs = {} + stage = 2 + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in self._last_block_indexes: + name = f"scale{stage}" + if name in self._out_features: + x_out = getattr(self, f"{name}_norm")(x) + outputs[name] = x_out.permute(0, 3, 1, 2) + stage += 1 + + return outputs diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/regnet.py b/EVA/EVA-02/det/detectron2/modeling/backbone/regnet.py new file mode 100644 index 00000000..3533d633 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/regnet.py @@ -0,0 +1,452 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Implementation of RegNet models from :paper:`dds` and :paper:`scaling`. + +This code is adapted from https://github.com/facebookresearch/pycls with minimal modifications. +Some code duplication exists between RegNet and ResNets (e.g., ResStem) in order to simplify +model loading. +""" + +import numpy as np +from torch import nn + +from detectron2.layers import CNNBlockBase, ShapeSpec, get_norm + +from .backbone import Backbone + +__all__ = [ + "AnyNet", + "RegNet", + "ResStem", + "SimpleStem", + "VanillaBlock", + "ResBasicBlock", + "ResBottleneckBlock", +] + + +def conv2d(w_in, w_out, k, *, stride=1, groups=1, bias=False): + """Helper for building a conv2d layer.""" + assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues." + s, p, g, b = stride, (k - 1) // 2, groups, bias + return nn.Conv2d(w_in, w_out, k, stride=s, padding=p, groups=g, bias=b) + + +def gap2d(): + """Helper for building a global average pooling layer.""" + return nn.AdaptiveAvgPool2d((1, 1)) + + +def pool2d(k, *, stride=1): + """Helper for building a pool2d layer.""" + assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues." 
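# padding of (k - 1) // 2 below gives "same"-style padding for odd kernels, so
# the pooled map shrinks only through the stride.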
+ return nn.MaxPool2d(k, stride=stride, padding=(k - 1) // 2) + + +def init_weights(m): + """Performs ResNet-style weight initialization.""" + if isinstance(m, nn.Conv2d): + # Note that there is no bias due to BN + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(mean=0.0, std=np.sqrt(2.0 / fan_out)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + m.weight.data.normal_(mean=0.0, std=0.01) + m.bias.data.zero_() + + +class ResStem(CNNBlockBase): + """ResNet stem for ImageNet: 7x7, BN, AF, MaxPool.""" + + def __init__(self, w_in, w_out, norm, activation_class): + super().__init__(w_in, w_out, 4) + self.conv = conv2d(w_in, w_out, 7, stride=2) + self.bn = get_norm(norm, w_out) + self.af = activation_class() + self.pool = pool2d(3, stride=2) + + def forward(self, x): + for layer in self.children(): + x = layer(x) + return x + + +class SimpleStem(CNNBlockBase): + """Simple stem for ImageNet: 3x3, BN, AF.""" + + def __init__(self, w_in, w_out, norm, activation_class): + super().__init__(w_in, w_out, 2) + self.conv = conv2d(w_in, w_out, 3, stride=2) + self.bn = get_norm(norm, w_out) + self.af = activation_class() + + def forward(self, x): + for layer in self.children(): + x = layer(x) + return x + + +class SE(nn.Module): + """Squeeze-and-Excitation (SE) block: AvgPool, FC, Act, FC, Sigmoid.""" + + def __init__(self, w_in, w_se, activation_class): + super().__init__() + self.avg_pool = gap2d() + self.f_ex = nn.Sequential( + conv2d(w_in, w_se, 1, bias=True), + activation_class(), + conv2d(w_se, w_in, 1, bias=True), + nn.Sigmoid(), + ) + + def forward(self, x): + return x * self.f_ex(self.avg_pool(x)) + + +class VanillaBlock(CNNBlockBase): + """Vanilla block: [3x3 conv, BN, Relu] x2.""" + + def __init__(self, w_in, w_out, stride, norm, activation_class, _params): + super().__init__(w_in, w_out, stride) + self.a = conv2d(w_in, w_out, 3, stride=stride) + self.a_bn = get_norm(norm, w_out) + self.a_af = activation_class() + self.b = conv2d(w_out, w_out, 3) + self.b_bn = get_norm(norm, w_out) + self.b_af = activation_class() + + def forward(self, x): + for layer in self.children(): + x = layer(x) + return x + + +class BasicTransform(nn.Module): + """Basic transformation: [3x3 conv, BN, Relu] x2.""" + + def __init__(self, w_in, w_out, stride, norm, activation_class, _params): + super().__init__() + self.a = conv2d(w_in, w_out, 3, stride=stride) + self.a_bn = get_norm(norm, w_out) + self.a_af = activation_class() + self.b = conv2d(w_out, w_out, 3) + self.b_bn = get_norm(norm, w_out) + self.b_bn.final_bn = True + + def forward(self, x): + for layer in self.children(): + x = layer(x) + return x + + +class ResBasicBlock(CNNBlockBase): + """Residual basic block: x + f(x), f = basic transform.""" + + def __init__(self, w_in, w_out, stride, norm, activation_class, params): + super().__init__(w_in, w_out, stride) + self.proj, self.bn = None, None + if (w_in != w_out) or (stride != 1): + self.proj = conv2d(w_in, w_out, 1, stride=stride) + self.bn = get_norm(norm, w_out) + self.f = BasicTransform(w_in, w_out, stride, norm, activation_class, params) + self.af = activation_class() + + def forward(self, x): + x_p = self.bn(self.proj(x)) if self.proj else x + return self.af(x_p + self.f(x)) + + +class BottleneckTransform(nn.Module): + """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" + + def __init__(self, w_in, w_out, stride, norm, activation_class, params): + super().__init__() + w_b = int(round(w_out * 
params["bot_mul"])) + w_se = int(round(w_in * params["se_r"])) + groups = w_b // params["group_w"] + self.a = conv2d(w_in, w_b, 1) + self.a_bn = get_norm(norm, w_b) + self.a_af = activation_class() + self.b = conv2d(w_b, w_b, 3, stride=stride, groups=groups) + self.b_bn = get_norm(norm, w_b) + self.b_af = activation_class() + self.se = SE(w_b, w_se, activation_class) if w_se else None + self.c = conv2d(w_b, w_out, 1) + self.c_bn = get_norm(norm, w_out) + self.c_bn.final_bn = True + + def forward(self, x): + for layer in self.children(): + x = layer(x) + return x + + +class ResBottleneckBlock(CNNBlockBase): + """Residual bottleneck block: x + f(x), f = bottleneck transform.""" + + def __init__(self, w_in, w_out, stride, norm, activation_class, params): + super().__init__(w_in, w_out, stride) + self.proj, self.bn = None, None + if (w_in != w_out) or (stride != 1): + self.proj = conv2d(w_in, w_out, 1, stride=stride) + self.bn = get_norm(norm, w_out) + self.f = BottleneckTransform(w_in, w_out, stride, norm, activation_class, params) + self.af = activation_class() + + def forward(self, x): + x_p = self.bn(self.proj(x)) if self.proj else x + return self.af(x_p + self.f(x)) + + +class AnyStage(nn.Module): + """AnyNet stage (sequence of blocks w/ the same output shape).""" + + def __init__(self, w_in, w_out, stride, d, block_class, norm, activation_class, params): + super().__init__() + for i in range(d): + block = block_class(w_in, w_out, stride, norm, activation_class, params) + self.add_module("b{}".format(i + 1), block) + stride, w_in = 1, w_out + + def forward(self, x): + for block in self.children(): + x = block(x) + return x + + +class AnyNet(Backbone): + """AnyNet model. See :paper:`dds`.""" + + def __init__( + self, + *, + stem_class, + stem_width, + block_class, + depths, + widths, + group_widths, + strides, + bottleneck_ratios, + se_ratio, + activation_class, + freeze_at=0, + norm="BN", + out_features=None, + ): + """ + Args: + stem_class (callable): A callable taking 4 arguments (channels in, channels out, + normalization, callable returning an activation function) that returns another + callable implementing the stem module. + stem_width (int): The number of output channels that the stem produces. + block_class (callable): A callable taking 6 arguments (channels in, channels out, + stride, normalization, callable returning an activation function, a dict of + block-specific parameters) that returns another callable implementing the repeated + block module. + depths (list[int]): Number of blocks in each stage. + widths (list[int]): For each stage, the number of output channels of each block. + group_widths (list[int]): For each stage, the number of channels per group in group + convolution, if the block uses group convolution. + strides (list[int]): The stride that each network stage applies to its input. + bottleneck_ratios (list[float]): For each stage, the ratio of the number of bottleneck + channels to the number of block input channels (or, equivalently, output channels), + if the block uses a bottleneck. + se_ratio (float): The ratio of the number of channels used inside the squeeze-excitation + (SE) module to it number of input channels, if SE the block uses SE. + activation_class (callable): A callable taking no arguments that returns another + callable implementing an activation function. + freeze_at (int): The number of stages at the beginning to freeze. + see :meth:`freeze` for detailed explanation. + norm (str or callable): normalization for all conv layers. 
+ See :func:`layers.get_norm` for supported format. + out_features (list[str]): name of the layers whose outputs should + be returned in forward. RegNet's use "stem" and "s1", "s2", etc for the stages after + the stem. If None, will return the output of the last layer. + """ + super().__init__() + self.stem = stem_class(3, stem_width, norm, activation_class) + + current_stride = self.stem.stride + self._out_feature_strides = {"stem": current_stride} + self._out_feature_channels = {"stem": self.stem.out_channels} + self.stages_and_names = [] + prev_w = stem_width + + for i, (d, w, s, b, g) in enumerate( + zip(depths, widths, strides, bottleneck_ratios, group_widths) + ): + params = {"bot_mul": b, "group_w": g, "se_r": se_ratio} + stage = AnyStage(prev_w, w, s, d, block_class, norm, activation_class, params) + name = "s{}".format(i + 1) + self.add_module(name, stage) + self.stages_and_names.append((stage, name)) + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in stage.children()]) + ) + self._out_feature_channels[name] = list(stage.children())[-1].out_channels + prev_w = w + + self.apply(init_weights) + + if out_features is None: + out_features = [name] + self._out_features = out_features + assert len(self._out_features) + children = [x[0] for x in self.named_children()] + for out_feature in self._out_features: + assert out_feature in children, "Available children: {} does not include {}".format( + ", ".join(children), out_feature + ) + self.freeze(freeze_at) + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert x.dim() == 4, f"Model takes an input of shape (N, C, H, W). Got {x.shape} instead!" + outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for stage, name in self.stages_and_names: + x = stage(x) + if name in self._out_features: + outputs[name] = x + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } + + def freeze(self, freeze_at=0): + """ + Freeze the first several stages of the model. Commonly used in fine-tuning. + + Layers that produce the same feature map spatial size are defined as one + "stage" by :paper:`FPN`. + + Args: + freeze_at (int): number of stages to freeze. + `1` means freezing the stem. `2` means freezing the stem and + one residual stage, etc. 
+ + Returns: + nn.Module: this model itself + """ + if freeze_at >= 1: + self.stem.freeze() + for idx, (stage, _) in enumerate(self.stages_and_names, start=2): + if freeze_at >= idx: + for block in stage.children(): + block.freeze() + return self + + +def adjust_block_compatibility(ws, bs, gs): + """Adjusts the compatibility of widths, bottlenecks, and groups.""" + assert len(ws) == len(bs) == len(gs) + assert all(w > 0 and b > 0 and g > 0 for w, b, g in zip(ws, bs, gs)) + vs = [int(max(1, w * b)) for w, b in zip(ws, bs)] + gs = [int(min(g, v)) for g, v in zip(gs, vs)] + ms = [np.lcm(g, b) if b > 1 else g for g, b in zip(gs, bs)] + vs = [max(m, int(round(v / m) * m)) for v, m in zip(vs, ms)] + ws = [int(v / b) for v, b in zip(vs, bs)] + assert all(w * b % g == 0 for w, b, g in zip(ws, bs, gs)) + return ws, bs, gs + + +def generate_regnet_parameters(w_a, w_0, w_m, d, q=8): + """Generates per stage widths and depths from RegNet parameters.""" + assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0 + # Generate continuous per-block ws + ws_cont = np.arange(d) * w_a + w_0 + # Generate quantized per-block ws + ks = np.round(np.log(ws_cont / w_0) / np.log(w_m)) + ws_all = w_0 * np.power(w_m, ks) + ws_all = np.round(np.divide(ws_all, q)).astype(int) * q + # Generate per stage ws and ds (assumes ws_all are sorted) + ws, ds = np.unique(ws_all, return_counts=True) + # Compute number of actual stages and total possible stages + num_stages, total_stages = len(ws), ks.max() + 1 + # Convert numpy arrays to lists and return + ws, ds, ws_all, ws_cont = (x.tolist() for x in (ws, ds, ws_all, ws_cont)) + return ws, ds, num_stages, total_stages, ws_all, ws_cont + + +class RegNet(AnyNet): + """RegNet model. See :paper:`dds`.""" + + def __init__( + self, + *, + stem_class, + stem_width, + block_class, + depth, + w_a, + w_0, + w_m, + group_width, + stride=2, + bottleneck_ratio=1.0, + se_ratio=0.0, + activation_class=None, + freeze_at=0, + norm="BN", + out_features=None, + ): + """ + Build a RegNet from the parameterization described in :paper:`dds` Section 3.3. + + Args: + See :class:`AnyNet` for arguments that are not listed here. + depth (int): Total number of blocks in the RegNet. + w_a (float): Factor by which block width would increase prior to quantizing block widths + by stage. See :paper:`dds` Section 3.3. + w_0 (int): Initial block width. See :paper:`dds` Section 3.3. + w_m (float): Parameter controlling block width quantization. + See :paper:`dds` Section 3.3. + group_width (int): Number of channels per group in group convolution, if the block uses + group convolution. + bottleneck_ratio (float): The ratio of the number of bottleneck channels to the number + of block input channels (or, equivalently, output channels), if the block uses a + bottleneck. + stride (int): The stride that each network stage applies to its input. 
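+
+ Example:
+ ::
+ # A RegNetY-4GF-style backbone (hyperparameters as in the pycls model zoo;
+ # treat this as a sketch and adapt it to the model you actually need).
+ regnet = RegNet(
+ stem_class=SimpleStem,
+ stem_width=32,
+ block_class=ResBottleneckBlock,
+ depth=22,
+ w_a=31.41,
+ w_0=96,
+ w_m=2.24,
+ group_width=64,
+ se_ratio=0.25,
+ norm="FrozenBN",
+ out_features=["s1", "s2", "s3", "s4"],
+ )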
+ """ + ws, ds = generate_regnet_parameters(w_a, w_0, w_m, depth)[0:2] + ss = [stride for _ in ws] + bs = [bottleneck_ratio for _ in ws] + gs = [group_width for _ in ws] + ws, bs, gs = adjust_block_compatibility(ws, bs, gs) + + def default_activation_class(): + return nn.ReLU(inplace=True) + + super().__init__( + stem_class=stem_class, + stem_width=stem_width, + block_class=block_class, + depths=ds, + widths=ws, + strides=ss, + group_widths=gs, + bottleneck_ratios=bs, + se_ratio=se_ratio, + activation_class=default_activation_class + if activation_class is None + else activation_class, + freeze_at=freeze_at, + norm=norm, + out_features=out_features, + ) diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/resnet.py b/EVA/EVA-02/det/detectron2/modeling/backbone/resnet.py new file mode 100644 index 00000000..5b8e842c --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/resnet.py @@ -0,0 +1,694 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import numpy as np +import fvcore.nn.weight_init as weight_init +import torch +import torch.nn.functional as F +from torch import nn + +from detectron2.layers import ( + CNNBlockBase, + Conv2d, + DeformConv, + ModulatedDeformConv, + ShapeSpec, + get_norm, +) + +from .backbone import Backbone +from .build import BACKBONE_REGISTRY + +__all__ = [ + "ResNetBlockBase", + "BasicBlock", + "BottleneckBlock", + "DeformBottleneckBlock", + "BasicStem", + "ResNet", + "make_stage", + "build_resnet_backbone", +] + + +class BasicBlock(CNNBlockBase): + """ + The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`, + with two 3x3 conv layers and a projection shortcut if needed. + """ + + def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"): + """ + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (int): Stride for the first conv. + norm (str or callable): normalization for all conv layers. + See :func:`layers.get_norm` for supported format. + """ + super().__init__(in_channels, out_channels, stride) + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels), + ) + else: + self.shortcut = None + + self.conv1 = Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + norm=get_norm(norm, out_channels), + ) + + self.conv2 = Conv2d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + norm=get_norm(norm, out_channels), + ) + + for layer in [self.conv1, self.conv2, self.shortcut]: + if layer is not None: # shortcut can be None + weight_init.c2_msra_fill(layer) + + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + out = self.conv2(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +class BottleneckBlock(CNNBlockBase): + """ + The standard bottleneck residual block used by ResNet-50, 101 and 152 + defined in :paper:`ResNet`. It contains 3 conv layers with kernels + 1x1, 3x3, 1x1, and a projection shortcut if needed. + """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="BN", + stride_in_1x1=False, + dilation=1, + ): + """ + Args: + bottleneck_channels (int): number of output channels for the 3x3 + "bottleneck" conv layers. 
+ num_groups (int): number of groups for the 3x3 conv layer. + norm (str or callable): normalization for all conv layers. + See :func:`layers.get_norm` for supported format. + stride_in_1x1 (bool): when stride>1, whether to put stride in the + first 1x1 convolution or the bottleneck 3x3 convolution. + dilation (int): the dilation rate of the 3x3 conv layer. + """ + super().__init__(in_channels, out_channels, stride) + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels), + ) + else: + self.shortcut = None + + # The original MSRA ResNet models have stride in the first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv2 = Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=1 * dilation, + bias=False, + groups=num_groups, + dilation=dilation, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv3 = Conv2d( + bottleneck_channels, + out_channels, + kernel_size=1, + bias=False, + norm=get_norm(norm, out_channels), + ) + + for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: + if layer is not None: # shortcut can be None + weight_init.c2_msra_fill(layer) + + # Zero-initialize the last normalization in each residual branch, + # so that at the beginning, the residual branch starts with zeros, + # and each residual block behaves like an identity. + # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "For BN layers, the learnable scaling coefficient γ is initialized + # to be 1, except for each residual block's last BN + # where γ is initialized to be 0." + + # nn.init.constant_(self.conv3.norm.weight, 0) + # TODO this somehow hurts performance when training GN models from scratch. + # Add it as an option when we need to use this code to train a backbone. + + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + + out = self.conv2(out) + out = F.relu_(out) + + out = self.conv3(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +class DeformBottleneckBlock(CNNBlockBase): + """ + Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv ` + in the 3x3 convolution. 
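+ The sampling offsets (and modulation masks, when ``deform_modulated`` is True)
+ are predicted by ``conv2_offset``, which is zero-initialized so that the
+ deformable convolution initially samples the regular grid.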
+ """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="BN", + stride_in_1x1=False, + dilation=1, + deform_modulated=False, + deform_num_groups=1, + ): + super().__init__(in_channels, out_channels, stride) + self.deform_modulated = deform_modulated + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels), + ) + else: + self.shortcut = None + + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + norm=get_norm(norm, bottleneck_channels), + ) + + if deform_modulated: + deform_conv_op = ModulatedDeformConv + # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size + offset_channels = 27 + else: + deform_conv_op = DeformConv + offset_channels = 18 + + self.conv2_offset = Conv2d( + bottleneck_channels, + offset_channels * deform_num_groups, + kernel_size=3, + stride=stride_3x3, + padding=1 * dilation, + dilation=dilation, + ) + self.conv2 = deform_conv_op( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=1 * dilation, + bias=False, + groups=num_groups, + dilation=dilation, + deformable_groups=deform_num_groups, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv3 = Conv2d( + bottleneck_channels, + out_channels, + kernel_size=1, + bias=False, + norm=get_norm(norm, out_channels), + ) + + for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: + if layer is not None: # shortcut can be None + weight_init.c2_msra_fill(layer) + + nn.init.constant_(self.conv2_offset.weight, 0) + nn.init.constant_(self.conv2_offset.bias, 0) + + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + + if self.deform_modulated: + offset_mask = self.conv2_offset(out) + offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((offset_x, offset_y), dim=1) + mask = mask.sigmoid() + out = self.conv2(out, offset, mask) + else: + offset = self.conv2_offset(out) + out = self.conv2(out, offset) + out = F.relu_(out) + + out = self.conv3(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +class BasicStem(CNNBlockBase): + """ + The standard ResNet stem (layers before the first residual block), + with a conv, relu and max_pool. + """ + + def __init__(self, in_channels=3, out_channels=64, norm="BN"): + """ + Args: + norm (str or callable): norm after the first conv layer. + See :func:`layers.get_norm` for supported format. + """ + super().__init__(in_channels, out_channels, 4) + self.in_channels = in_channels + self.conv1 = Conv2d( + in_channels, + out_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False, + norm=get_norm(norm, out_channels), + ) + weight_init.c2_msra_fill(self.conv1) + + def forward(self, x): + x = self.conv1(x) + x = F.relu_(x) + x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) + return x + + +class ResNet(Backbone): + """ + Implement :paper:`ResNet`. + """ + + def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0): + """ + Args: + stem (nn.Module): a stem module + stages (list[list[CNNBlockBase]]): several (typically 4) stages, + each contains multiple :class:`CNNBlockBase`. 
+ num_classes (None or int): if None, will not perform classification. + Otherwise, will create a linear layer. + out_features (list[str]): name of the layers whose outputs should + be returned in forward. Can be anything in "stem", "linear", or "res2" ... + If None, will return the output of the last layer. + freeze_at (int): The number of stages at the beginning to freeze. + see :meth:`freeze` for detailed explanation. + """ + super().__init__() + self.stem = stem + self.num_classes = num_classes + + current_stride = self.stem.stride + self._out_feature_strides = {"stem": current_stride} + self._out_feature_channels = {"stem": self.stem.out_channels} + + self.stage_names, self.stages = [], [] + + if out_features is not None: + # Avoid keeping unused layers in this module. They consume extra memory + # and may cause allreduce to fail + num_stages = max( + [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features] + ) + stages = stages[:num_stages] + for i, blocks in enumerate(stages): + assert len(blocks) > 0, len(blocks) + for block in blocks: + assert isinstance(block, CNNBlockBase), block + + name = "res" + str(i + 2) + stage = nn.Sequential(*blocks) + + self.add_module(name, stage) + self.stage_names.append(name) + self.stages.append(stage) + + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in blocks]) + ) + self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels + self.stage_names = tuple(self.stage_names) # Make it static for scripting + + if num_classes is not None: + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.linear = nn.Linear(curr_channels, num_classes) + + # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "The 1000-way fully-connected layer is initialized by + # drawing weights from a zero-mean Gaussian with standard deviation of 0.01." + nn.init.normal_(self.linear.weight, std=0.01) + name = "linear" + + if out_features is None: + out_features = [name] + self._out_features = out_features + assert len(self._out_features) + children = [x[0] for x in self.named_children()] + for out_feature in self._out_features: + assert out_feature in children, "Available children: {}".format(", ".join(children)) + self.freeze(freeze_at) + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!" + outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for name, stage in zip(self.stage_names, self.stages): + x = stage(x) + if name in self._out_features: + outputs[name] = x + if self.num_classes is not None: + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.linear(x) + if "linear" in self._out_features: + outputs["linear"] = x + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } + + def freeze(self, freeze_at=0): + """ + Freeze the first several stages of the ResNet. Commonly used in + fine-tuning. + + Layers that produce the same feature map spatial size are defined as one + "stage" by :paper:`FPN`. + + Args: + freeze_at (int): number of stages to freeze. + `1` means freezing the stem. `2` means freezing the stem and + one residual stage, etc. 
+ + Returns: + nn.Module: this ResNet itself + """ + if freeze_at >= 1: + self.stem.freeze() + for idx, stage in enumerate(self.stages, start=2): + if freeze_at >= idx: + for block in stage.children(): + block.freeze() + return self + + @staticmethod + def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs): + """ + Create a list of blocks of the same type that forms one ResNet stage. + + Args: + block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this + stage. A module of this type must not change spatial resolution of inputs unless its + stride != 1. + num_blocks (int): number of blocks in this stage + in_channels (int): input channels of the entire stage. + out_channels (int): output channels of **every block** in the stage. + kwargs: other arguments passed to the constructor of + `block_class`. If the argument name is "xx_per_block", the + argument is a list of values to be passed to each block in the + stage. Otherwise, the same argument is passed to every block + in the stage. + + Returns: + list[CNNBlockBase]: a list of block module. + + Examples: + :: + stage = ResNet.make_stage( + BottleneckBlock, 3, in_channels=16, out_channels=64, + bottleneck_channels=16, num_groups=1, + stride_per_block=[2, 1, 1], + dilations_per_block=[1, 1, 2] + ) + + Usually, layers that produce the same feature map spatial size are defined as one + "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should + all be 1. + """ + blocks = [] + for i in range(num_blocks): + curr_kwargs = {} + for k, v in kwargs.items(): + if k.endswith("_per_block"): + assert len(v) == num_blocks, ( + f"Argument '{k}' of make_stage should have the " + f"same length as num_blocks={num_blocks}." + ) + newk = k[: -len("_per_block")] + assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!" + curr_kwargs[newk] = v[i] + else: + curr_kwargs[k] = v + + blocks.append( + block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs) + ) + in_channels = out_channels + return blocks + + @staticmethod + def make_default_stages(depth, block_class=None, **kwargs): + """ + Created list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152). + If it doesn't create the ResNet variant you need, please use :meth:`make_stage` + instead for fine-grained customization. + + Args: + depth (int): depth of ResNet + block_class (type): the CNN block class. Has to accept + `bottleneck_channels` argument for depth > 50. + By default it is BasicBlock or BottleneckBlock, based on the + depth. + kwargs: + other arguments to pass to `make_stage`. Should not contain + stride and channels, as they are predefined for each depth. + + Returns: + list[list[CNNBlockBase]]: modules in all stages; see arguments of + :class:`ResNet.__init__`. 
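+
+ Examples:
+ ::
+ # A typical fine-tuning setup: standard R50 stages with frozen BatchNorm
+ # (a sketch; pick the norm and depth your config actually uses).
+ stages = ResNet.make_default_stages(50, norm="FrozenBN", stride_in_1x1=True)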
+ """ + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + if block_class is None: + block_class = BasicBlock if depth < 50 else BottleneckBlock + if depth < 50: + in_channels = [64, 64, 128, 256] + out_channels = [64, 128, 256, 512] + else: + in_channels = [64, 256, 512, 1024] + out_channels = [256, 512, 1024, 2048] + ret = [] + for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels): + if depth >= 50: + kwargs["bottleneck_channels"] = o // 4 + ret.append( + ResNet.make_stage( + block_class=block_class, + num_blocks=n, + stride_per_block=[s] + [1] * (n - 1), + in_channels=i, + out_channels=o, + **kwargs, + ) + ) + return ret + + +ResNetBlockBase = CNNBlockBase +""" +Alias for backward compatibiltiy. +""" + + +def make_stage(*args, **kwargs): + """ + Deprecated alias for backward compatibiltiy. + """ + return ResNet.make_stage(*args, **kwargs) + + +@BACKBONE_REGISTRY.register() +def build_resnet_backbone(cfg, input_shape): + """ + Create a ResNet instance from config. + + Returns: + ResNet: a :class:`ResNet` instance. + """ + # need registration of new blocks/stems? + norm = cfg.MODEL.RESNETS.NORM + stem = BasicStem( + in_channels=input_shape.channels, + out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, + norm=norm, + ) + + # fmt: off + freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT + out_features = cfg.MODEL.RESNETS.OUT_FEATURES + depth = cfg.MODEL.RESNETS.DEPTH + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + bottleneck_channels = num_groups * width_per_group + in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 + res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION + deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE + deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED + deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS + # fmt: on + assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) + + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + + if depth in [18, 34]: + assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34" + assert not any( + deform_on_per_stage + ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34" + assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34" + assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34" + + stages = [] + + for idx, stage_idx in enumerate(range(2, 6)): + # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper + dilation = res5_dilation if stage_idx == 5 else 1 + first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 + stage_kargs = { + "num_blocks": num_blocks_per_stage[idx], + "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1), + "in_channels": in_channels, + "out_channels": out_channels, + "norm": norm, + } + # Use BasicBlock for R18 and R34. 
+ if depth in [18, 34]: + stage_kargs["block_class"] = BasicBlock + else: + stage_kargs["bottleneck_channels"] = bottleneck_channels + stage_kargs["stride_in_1x1"] = stride_in_1x1 + stage_kargs["dilation"] = dilation + stage_kargs["num_groups"] = num_groups + if deform_on_per_stage[idx]: + stage_kargs["block_class"] = DeformBottleneckBlock + stage_kargs["deform_modulated"] = deform_modulated + stage_kargs["deform_num_groups"] = deform_num_groups + else: + stage_kargs["block_class"] = BottleneckBlock + blocks = ResNet.make_stage(**stage_kargs) + in_channels = out_channels + out_channels *= 2 + bottleneck_channels *= 2 + stages.append(blocks) + return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at) diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/swin.py b/EVA/EVA-02/det/detectron2/modeling/backbone/swin.py new file mode 100644 index 00000000..c5986154 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/swin.py @@ -0,0 +1,690 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Implementation of Swin models from :paper:`swin`. + +This code is adapted from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py with minimal modifications. # noqa +-------------------------------------------------------- +Swin Transformer +Copyright (c) 2021 Microsoft +Licensed under The MIT License [see LICENSE for details] +Written by Ze Liu, Yutong Lin, Yixuan Wei +-------------------------------------------------------- +LICENSE: https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/461e003166a8083d0b620beacd4662a2df306bd6/LICENSE +""" + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from detectron2.modeling.backbone.backbone import Backbone + +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + + +class Mlp(nn.Module): + """Multilayer perceptron.""" + + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + """Window based multi-head self attention (W-MSA) module with relative position bias. 
+ It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. + Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B_, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + """Swin Transformer Block. + Args: + dim (int): Number of input channels. 
+ num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. 
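+ (H and W are not passed in explicitly; the block reads them from ``self.H`` and
+ ``self.W``, which :class:`BasicLayer` assigns right before calling each block.)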
+ """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. 
+ Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. 
Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(Backbone): + """Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted + Windows` - https://arxiv.org/pdf/2103.14030 + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
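+ Example:
+ ::
+ # The defaults correspond to Swin-T; a Swin-B-sized backbone could be built as:
+ swin = SwinTransformer(embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32))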
+ """ + + def __init__( + self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + use_checkpoint=False, + ): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) + ) + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + ) + self.layers.append(layer) + + num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(num_features[i_layer]) + layer_name = f"norm{i_layer}" + self.add_module(layer_name, layer) + + self._freeze_stages() + self._out_features = ["p{}".format(i) for i in self.out_indices] + self._out_feature_channels = { + "p{}".format(i): self.embed_dim * 2**i for i in self.out_indices + } + self._out_feature_strides = {"p{}".format(i): 2 ** (i + 2) for i in self.out_indices} + self._size_devisibility = 32 + + self.apply(self._init_weights) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @property + def size_divisibility(self): + return self._size_divisibility + + def forward(self, x): + """Forward function.""" 
+ x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = {} + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs["p{}".format(i)] = out + + return outs diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/utils.py b/EVA/EVA-02/det/detectron2/modeling/backbone/utils.py new file mode 100644 index 00000000..49da0b7d --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/utils.py @@ -0,0 +1,492 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math +import numpy as np +from scipy import interpolate +import torch +import torch.nn as nn +import torch.nn.functional as F + +__all__ = [ + "window_partition", + "window_unpartition", + "add_decomposed_rel_pos", + "get_abs_pos", + "PatchEmbed", + "VisionRotaryEmbeddingFast", +] + + +def window_partition(x, window_size): + """ + Partition into non-overlapping windows with padding if needed. + Args: + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. + + Returns: + windows: windows after partition with [B * num_windows, window_size, window_size, C]. + (Hp, Wp): padded height and width before partition + """ + B, H, W, C = x.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows, (Hp, Wp) + + +def window_unpartition(windows, window_size, pad_hw, hw): + """ + Window unpartition into original sequences and removing padding. + Args: + x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() + return x + + +def get_rel_pos(q_size, k_size, rel_pos): + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. + Args: + q_size (int): size of query q. + k_size (int): size of key k. + rel_pos (Tensor): relative position embeddings (L, C). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + use_log_interpolation = True + + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + if not use_log_interpolation: + # Interpolate rel pos. 
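+ # (use_log_interpolation is hard-coded to True above, so this plain linear resize
+ # of the table is effectively never taken; the else-branch below resamples the
+ # embeddings at geometric, log-spaced coordinates with cubic extrapolation, in
+ # the style of BEiT's relative-position interpolation.)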
+ rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + else: + src_size = rel_pos.shape[0] + dst_size = max_rel_dist + + # q = 1.13492 + q = 1.0903078 + dis = [] + + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q ** (i + 1) + + r_ids = [-_ for _ in reversed(dis)] + x = r_ids + [0] + dis + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + # print("x = %s" % str(x)) + # print("dx = %s" % str(dx)) + all_rel_pos_bias = [] + for i in range(rel_pos.shape[1]): + z = rel_pos[:, i].view(src_size).cpu().float().numpy() + f = interpolate.interp1d(x, z, kind='cubic', fill_value="extrapolate") + all_rel_pos_bias.append( + torch.Tensor(f(dx)).contiguous().view(-1, 1).to(rel_pos.device)) + rel_pos_resized = torch.cat(all_rel_pos_bias, dim=-1) + else: + rel_pos_resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.long()] + + +def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size): + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. + q_size (Tuple): spatial sequence size of query q with (q_h, q_w). + k_size (Tuple): spatial sequence size of key k with (k_h, k_w). + + Returns: + attn (Tensor): attention map with added relative positional embeddings. + """ + q_h, q_w = q_size + k_h, k_w = k_size + Rh = get_rel_pos(q_h, k_h, rel_pos_h) + Rw = get_rel_pos(q_w, k_w, rel_pos_w) + + B, _, dim = q.shape + r_q = q.reshape(B, q_h, q_w, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) + rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) + + attn = ( + attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] + ).view(B, q_h * q_w, k_h * k_w) + + return attn + + +def get_abs_pos(abs_pos, has_cls_token, hw): + """ + Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token + dimension for the original embeddings. + Args: + abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). + has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. + hw (Tuple): size of input image tokens. + + Returns: + Absolute positional embeddings after processing with shape (1, H, W, C) + """ + h, w = hw + if has_cls_token: + abs_pos = abs_pos[:, 1:] + xy_num = abs_pos.shape[1] + size = int(math.sqrt(xy_num)) + assert size * size == xy_num + + if size != h or size != w: + new_abs_pos = F.interpolate( + abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2), + size=(h, w), + mode="bicubic", + align_corners=False, + ) + + return new_abs_pos.permute(0, 2, 3, 1) + else: + return abs_pos.reshape(1, h, w, -1) + + +class PatchEmbed(nn.Module): + """ + Image to Patch Embedding. 
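+ The projected features are returned in channels-last (B, H, W, C) layout,
+ which is the layout the ViT/MViT blocks in this package operate on.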
+ """ + + def __init__( + self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768 + ): + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): embed_dim (int): Patch embedding dimension. + """ + super().__init__() + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x): + x = self.proj(x) + # B C H W -> B H W C + x = x.permute(0, 2, 3, 1) + return x + + + + +from math import pi + +import torch +from torch import nn + +from einops import rearrange, repeat + + + +def broadcat(tensors, dim = -1): + num_tensors = len(tensors) + shape_lens = set(list(map(lambda t: len(t.shape), tensors))) + assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions' + shape_len = list(shape_lens)[0] + dim = (dim + shape_len) if dim < 0 else dim + dims = list(zip(*map(lambda t: list(t.shape), tensors))) + expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] + assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatentation' + max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) + expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) + expanded_dims.insert(dim, (dim, dims[dim])) + expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) + tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes))) + return torch.cat(tensors, dim = dim) + + + +def rotate_half(x): + x = rearrange(x, '... (d r) -> ... d r', r = 2) + x1, x2 = x.unbind(dim = -1) + x = torch.stack((-x2, x1), dim = -1) + return rearrange(x, '... d r -> ... (d r)') + + + +class VisionRotaryEmbedding(nn.Module): + def __init__( + self, + dim, + pt_seq_len, + ft_seq_len=None, + custom_freqs = None, + freqs_for = 'lang', + theta = 10000, + max_freq = 10, + num_freqs = 1, + ): + super().__init__() + if custom_freqs: + freqs = custom_freqs + elif freqs_for == 'lang': + freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) + elif freqs_for == 'pixel': + freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi + elif freqs_for == 'constant': + freqs = torch.ones(num_freqs).float() + else: + raise ValueError(f'unknown modality {freqs_for}') + + if ft_seq_len is None: ft_seq_len = pt_seq_len + t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len + + freqs_h = torch.einsum('..., f -> ... f', t, freqs) + freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2) + + freqs_w = torch.einsum('..., f -> ... f', t, freqs) + freqs_w = repeat(freqs_w, '... n -> ... 
(n r)', r = 2) + + freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim = -1) + + self.register_buffer("freqs_cos", freqs.cos()) + self.register_buffer("freqs_sin", freqs.sin()) + + print('======== shape of rope freq', self.freqs_cos.shape, '========') + + def forward(self, t, start_index = 0): + rot_dim = self.freqs_cos.shape[-1] + end_index = start_index + rot_dim + assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}' + t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:] + t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin) + return torch.cat((t_left, t, t_right), dim = -1) + + + + +class VisionRotaryEmbeddingFast(nn.Module): + def __init__( + self, + dim, + pt_seq_len=16, + ft_seq_len=None, + custom_freqs = None, + freqs_for = 'lang', + theta = 10000, + max_freq = 10, + num_freqs = 1, + ): + super().__init__() + if custom_freqs: + freqs = custom_freqs + elif freqs_for == 'lang': + freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) + elif freqs_for == 'pixel': + freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi + elif freqs_for == 'constant': + freqs = torch.ones(num_freqs).float() + else: + raise ValueError(f'unknown modality {freqs_for}') + + if ft_seq_len is None: ft_seq_len = pt_seq_len + t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len + + freqs = torch.einsum('..., f -> ... f', t, freqs) + freqs = repeat(freqs, '... n -> ... (n r)', r = 2) + freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1) + + freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) + freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) + + self.register_buffer("freqs_cos", freqs_cos) + self.register_buffer("freqs_sin", freqs_sin) + + print('======== shape of rope freq', self.freqs_cos.shape, '========') + + def forward(self, t, H, W): + # print(t.shape) + # print(self.freqs_cos.shape) + # print(t.shape, self.freqs_cos.shape, self.freqs_sin.shape) + return t * self.freqs_cos + rotate_half(t) * self.freqs_sin + + +class VisionRotaryEmbeddingFastMS(nn.Module): + def __init__( + self, + dim, + pt_seq_len=16, + ft_seq_len=None, + custom_freqs = None, + freqs_for = 'lang', + theta = 10000, + max_freq = 10, + num_freqs = 1, + ): + super().__init__() + if custom_freqs: + freqs = custom_freqs + elif freqs_for == 'lang': + freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) + elif freqs_for == 'pixel': + freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi + elif freqs_for == 'constant': + freqs = torch.ones(num_freqs).float() + else: + raise ValueError(f'unknown modality {freqs_for}') + + self.ft_seq_len = ft_seq_len + freqs_tmp = freqs + for i, ft_seq_len_sub in enumerate(ft_seq_len): + t = torch.arange(ft_seq_len_sub) / ft_seq_len_sub * pt_seq_len + # print(ft_seq_len_sub, t.shape) + + freqs = torch.einsum('..., f -> ... f', t, freqs_tmp) + freqs = repeat(freqs, '... n -> ... 
(n r)', r = 2) + freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1) + + freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) + freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) + + self.register_buffer(f"freqs_cos_{i}", freqs_cos) + self.register_buffer(f"freqs_sin_{i}", freqs_sin) + + print('======== shape of rope freq', freqs_cos.shape, '========') + + def forward(self, t, H, W): + for i in range(len(self.ft_seq_len)): + # print(H, self.ft_seq_len[i]) + if H == self.ft_seq_len[i]: + break + freqs_cos = getattr(self, f'freqs_cos_{i}') + freqs_sin = getattr(self, f'freqs_sin_{i}') + return t * freqs_cos + rotate_half(t) * freqs_sin + +def get_rope(t, H, W): + dim=32 + pt_seq_len=16 + custom_freqs = None + freqs_for = 'lang' + theta = 10000 + max_freq = 10 + num_freqs = 1 + + if custom_freqs: + freqs = custom_freqs + elif freqs_for == 'lang': + freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) + elif freqs_for == 'pixel': + freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi + elif freqs_for == 'constant': + freqs = torch.ones(num_freqs).float() + else: + raise ValueError(f'unknown modality {freqs_for}') + + tH = torch.arange(H) / H * pt_seq_len + tW = torch.arange(W) / W * pt_seq_len + + freqsH = torch.einsum('..., f -> ... f', tH, freqs) + freqsH = repeat(freqsH, '... n -> ... (n r)', r = 2) # H, 32 + freqsW = torch.einsum('..., f -> ... f', tW, freqs) + freqsW = repeat(freqsW, '... n -> ... (n r)', r = 2) # W, 32 + freqs = broadcat((freqsH[:, None, :], freqsW[None, :, :]), dim = -1) + freqs = freqs.to(t.device) + freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) + freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) + + return t * freqs_cos + rotate_half(t) * freqs_sin +# class get_rope(nn.Module): +# def __init__( +# self, +# dim, +# pt_seq_len=16, +# ft_seq_len=None, +# custom_freqs = None, +# freqs_for = 'lang', +# theta = 10000, +# max_freq = 10, +# num_freqs = 1, +# ): +# super().__init__() +# self.dim = dim +# self.pt_seq_len = pt_seq_len +# self.custom_freqs = custom_freqs +# self.freqs_for = freqs_for +# self.theta = theta +# self.max_freq = max_freq +# self.num_freqs = num_freqs + +# def forward(self, t, H, W): +# dim = self.dim +# pt_seq_len = self.pt_seq_len +# custom_freqs = self.custom_freqs +# freqs_for = self.freqs_for +# theta = self.theta +# max_freq = self.max_freq +# num_freqs = self.num_freqs + +# if custom_freqs: +# freqs = custom_freqs +# elif freqs_for == 'lang': +# freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) +# elif freqs_for == 'pixel': +# freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi +# elif freqs_for == 'constant': +# freqs = torch.ones(num_freqs).float() +# else: +# raise ValueError(f'unknown modality {freqs_for}') + +# tH = torch.arange(H) / H * pt_seq_len +# tW = torch.arange(W) / W * pt_seq_len + +# freqsH = torch.einsum('..., f -> ... f', tH, freqs) +# freqsH = repeat(freqsH, '... n -> ... (n r)', r = 2) # H, 32 +# freqsW = torch.einsum('..., f -> ... f', tW, freqs) +# freqsW = repeat(freqsW, '... n -> ... 
(n r)', r = 2) # W, 32 +# freqs = broadcat((freqsH[:, None, :], freqsW[None, :, :]), dim = -1) +# freqs = freqs.to(t.device) +# freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) +# freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) + +# return t * freqs_cos + rotate_half(t) * freqs_sin \ No newline at end of file diff --git a/EVA/EVA-02/det/detectron2/modeling/backbone/vit.py b/EVA/EVA-02/det/detectron2/modeling/backbone/vit.py new file mode 100644 index 00000000..cb423c54 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/backbone/vit.py @@ -0,0 +1,621 @@ +import logging +import math +from functools import partial + +import fvcore.nn.weight_init as weight_init +import torch +import torch.nn as nn +import torch.nn.functional as F + +from detectron2.layers import CNNBlockBase, Conv2d, get_norm +from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous + +from .backbone import Backbone +from .utils import ( + PatchEmbed, + add_decomposed_rel_pos, + get_abs_pos, + window_partition, + window_unpartition, + VisionRotaryEmbeddingFast, + get_rope, + VisionRotaryEmbeddingFastMS, +) + +try: + import xformers.ops as xops +except: + pass + +try: + from apex.normalization import FusedLayerNorm +except: + pass + + +logger = logging.getLogger(__name__) + + + +__all__ = ["ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"] + + + +class SwiGLU(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0., + norm_layer=nn.LayerNorm, subln=False + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.w1 = nn.Linear(in_features, hidden_features) + self.w2 = nn.Linear(in_features, hidden_features) + + self.act = act_layer() + self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity() + self.w3 = nn.Linear(hidden_features, out_features) + + self.drop = nn.Dropout(drop) + + def forward(self, x): + x1 = self.w1(x) + x2 = self.w2(x) + hidden = self.act(x1) * x2 + x = self.ffn_ln(hidden) + x = self.w3(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=True, + qk_scale=None, + attn_head_dim=None, + rope=None, + xattn=True, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.q_proj = nn.Linear(dim, all_head_dim, bias=False) + self.k_proj = nn.Linear(dim, all_head_dim, bias=False) + self.v_proj = nn.Linear(dim, all_head_dim, bias=False) + + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + self.rope = rope + self.xattn = xattn + self.proj = nn.Linear(all_head_dim, dim) + + def forward(self, x): + B, H, W, C = x.shape + x = x.view(B, -1, C) + N = H * W + + q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias) + k = F.linear(input=x, weight=self.k_proj.weight, bias=None) + v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias) + + q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C + k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + + ## rope + q = self.rope(q, H, W).type_as(v) + k = self.rope(k, H, W).type_as(v) + # if self.rope is not None: + # q 
= self.rope(q).type_as(v) + # k = self.rope(k).type_as(v) + # else: + # q = get_rope(q, H, W).type_as(v) + # k = get_rope(k, H, W).type_as(v) + + if self.xattn: + q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C + k = k.permute(0, 2, 1, 3) + v = v.permute(0, 2, 1, 3) + + x = xops.memory_efficient_attention(q, k, v) + x = x.reshape(B, N, -1) + else: + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + attn = attn.softmax(dim=-1).type_as(x) + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + + x = self.proj(x) + x = x.view(B, H, W, C) + + return x + + +class ResBottleneckBlock(CNNBlockBase): + """ + The standard bottleneck residual block without the last activation layer. + It contains 3 conv layers with kernels 1x1, 3x3, 1x1. + """ + + def __init__( + self, + in_channels, + out_channels, + bottleneck_channels, + norm="LN", + act_layer=nn.GELU, + ): + """ + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + bottleneck_channels (int): number of output channels for the 3x3 + "bottleneck" conv layers. + norm (str or callable): normalization for all conv layers. + See :func:`layers.get_norm` for supported format. + act_layer (callable): activation for all conv layers. + """ + super().__init__(in_channels, out_channels, 1) + + self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False) + self.norm1 = get_norm(norm, bottleneck_channels) + self.act1 = act_layer() + + self.conv2 = Conv2d( + bottleneck_channels, + bottleneck_channels, + 3, + padding=1, + bias=False, + ) + self.norm2 = get_norm(norm, bottleneck_channels) + self.act2 = act_layer() + + self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False) + self.norm3 = get_norm(norm, out_channels) + + for layer in [self.conv1, self.conv2, self.conv3]: + weight_init.c2_msra_fill(layer) + for layer in [self.norm1, self.norm2]: + layer.weight.data.fill_(1.0) + layer.bias.data.zero_() + # zero init last norm layer. + self.norm3.weight.data.zero_() + self.norm3.bias.data.zero_() + + def forward(self, x): + out = x + for layer in self.children(): + out = layer(out) + + out = x + out + return out + + +class Block(nn.Module): + """Transformer blocks with support of window attention and residual propagation blocks""" + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4*2/3, + qkv_bias=True, + drop_path=0.0, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + window_size=0, + use_residual_block=False, + rope=None, + xattn=True, + ): + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + drop_path (float): Stochastic depth rate. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. If it equals 0, then not + use window attention. + use_residual_block (bool): If True, use a residual block after the MLP block. + input_size (int or None): Input resolution for calculating the relative positional + parameter size. 
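+            rope (nn.Module or None): rotary position embedding module applied to the
+                query and key inside :class:`Attention`.
+            xattn (bool): If True, use xformers memory-efficient attention instead of
+                the naive softmax attention.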
+ """ + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + rope=rope, + xattn=xattn, + ) + + from timm.models.layers import DropPath + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = SwiGLU( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + subln=True, + norm_layer=norm_layer, + ) + + self.window_size = window_size + + self.use_residual_block = use_residual_block + if use_residual_block: + # Use a residual block with bottleneck channel as dim // 2 + self.residual = ResBottleneckBlock( + in_channels=dim, + out_channels=dim, + bottleneck_channels=dim // 2, + norm="LN", + ) + + def forward(self, x): + shortcut = x + x = self.norm1(x) + + # Window partition + if self.window_size > 0: + H, W = x.shape[1], x.shape[2] + x, pad_hw = window_partition(x, self.window_size) + + x = self.attn(x) + + # Reverse window partition + if self.window_size > 0: + x = window_unpartition(x, self.window_size, pad_hw, (H, W)) + + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + if self.use_residual_block: + x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + + return x + + +class ViT(Backbone): + """ + This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`. + "Exploring Plain Vision Transformer Backbones for Object Detection", + https://arxiv.org/abs/2203.16527 + """ + + def __init__( + self, + img_size=1024, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4*2/3, + qkv_bias=True, + drop_path_rate=0.0, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU, + use_abs_pos=True, + use_rel_pos=False, + rope=True, + pt_hw_seq_len=16, + intp_freq=True, + window_size=0, + window_block_indexes=(), + residual_block_indexes=(), + use_act_checkpoint=False, + pretrain_img_size=224, + pretrain_use_cls_token=True, + out_feature="last_feat", + xattn=True, + use_lsj=False, + ms_img_size=[1024], + ): + """ + Args: + img_size (int): Input image size. + patch_size (int): Patch size. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + depth (int): Depth of ViT. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + drop_path_rate (float): Stochastic depth rate. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_abs_pos (bool): If True, use absolute positional embeddings. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. + window_block_indexes (list): Indexes for blocks using window attention. + residual_block_indexes (list): Indexes for blocks using conv propagation. + use_act_checkpoint (bool): If True, use activation checkpointing. + pretrain_img_size (int): input image size for pretraining models. + pretrain_use_cls_token (bool): If True, pretrainig models use class token. + out_feature (str): name of the feature from the last block. 
+ """ + super().__init__() + self.pretrain_use_cls_token = pretrain_use_cls_token + self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + kernel_size=(patch_size, patch_size), + stride=(patch_size, patch_size), + in_chans=in_chans, + embed_dim=embed_dim, + ) + + if use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. + num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size) + num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches + self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim)) + else: + self.pos_embed = None + + + half_head_dim = embed_dim // num_heads // 2 + hw_seq_len = img_size // patch_size + + self.rope_win = VisionRotaryEmbeddingFast( + dim=half_head_dim, + pt_seq_len=pt_hw_seq_len, + ft_seq_len=window_size if intp_freq else None, + ) + + if not use_lsj: + self.rope_glb = VisionRotaryEmbeddingFast( + dim=half_head_dim, + pt_seq_len=pt_hw_seq_len, + ft_seq_len=hw_seq_len if intp_freq else None, + ) + else: + print(use_lsj) + # self.rope_glb = None + hw_seq_len = [ms // patch_size for ms in ms_img_size] + print(hw_seq_len) + self.rope_glb = VisionRotaryEmbeddingFastMS( + dim=half_head_dim, + pt_seq_len=pt_hw_seq_len, + ft_seq_len=hw_seq_len if intp_freq else None, + ) + + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + + self.blocks = nn.ModuleList() + for i in range(depth): + block = Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + window_size=window_size if i in window_block_indexes else 0, + use_residual_block=i in residual_block_indexes, + rope=self.rope_win if i in window_block_indexes else self.rope_glb, + xattn=xattn + ) + if use_act_checkpoint: + # TODO: use torch.utils.checkpoint + from fairscale.nn.checkpoint import checkpoint_wrapper + + block = checkpoint_wrapper(block) + self.blocks.append(block) + + self._out_feature_channels = {out_feature: embed_dim} + self._out_feature_strides = {out_feature: patch_size} + self._out_features = [out_feature] + + if self.pos_embed is not None: + nn.init.trunc_normal_(self.pos_embed, std=0.02) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x): + x = self.patch_embed(x) + if self.pos_embed is not None: + x = x + get_abs_pos( + self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2]) + ) + + for blk in self.blocks: + x = blk(x) + + outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)} + return outputs + + +class SimpleFeaturePyramid(Backbone): + """ + This module implements SimpleFeaturePyramid in :paper:`vitdet`. + It creates pyramid features built on top of the input feature map. + """ + + def __init__( + self, + net, + in_feature, + out_channels, + scale_factors, + top_block=None, + norm="LN", + square_pad=0, + ): + """ + Args: + net (Backbone): module representing the subnetwork backbone. + Must be a subclass of :class:`Backbone`. + in_feature (str): names of the input feature maps coming + from the net. + out_channels (int): number of channels in the output feature maps. 
+ scale_factors (list[float]): list of scaling factors to upsample or downsample + the input features for creating pyramid features. + top_block (nn.Module or None): if provided, an extra operation will + be performed on the output of the last (smallest resolution) + pyramid output, and the result will extend the result list. The top_block + further downsamples the feature map. It must have an attribute + "num_levels", meaning the number of extra pyramid levels added by + this block, and "in_feature", which is a string representing + its input feature (e.g., p5). + norm (str): the normalization to use. + square_pad (int): If > 0, require input images to be padded to specific square size. + """ + super(SimpleFeaturePyramid, self).__init__() + assert isinstance(net, Backbone) + + self.scale_factors = scale_factors + + input_shapes = net.output_shape() + strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors] + _assert_strides_are_log2_contiguous(strides) + + dim = input_shapes[in_feature].channels + self.stages = [] + use_bias = norm == "" + for idx, scale in enumerate(scale_factors): + out_dim = dim + if scale == 4.0: + layers = [ + nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2), + get_norm(norm, dim // 2), + nn.GELU(), + nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2), + ] + out_dim = dim // 4 + elif scale == 2.0: + layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)] + out_dim = dim // 2 + elif scale == 1.0: + layers = [] + elif scale == 0.5: + layers = [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + raise NotImplementedError(f"scale_factor={scale} is not supported yet.") + + layers.extend( + [ + Conv2d( + out_dim, + out_channels, + kernel_size=1, + bias=use_bias, + norm=get_norm(norm, out_channels), + ), + Conv2d( + out_channels, + out_channels, + kernel_size=3, + padding=1, + bias=use_bias, + norm=get_norm(norm, out_channels), + ), + ] + ) + layers = nn.Sequential(*layers) + + stage = int(math.log2(strides[idx])) + self.add_module(f"simfp_{stage}", layers) + self.stages.append(layers) + + self.net = net + self.in_feature = in_feature + self.top_block = top_block + # Return feature names are "p", like ["p2", "p3", ..., "p6"] + self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} + # top block output feature maps. + if self.top_block is not None: + for s in range(stage, stage + self.top_block.num_levels): + self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) + + self._out_features = list(self._out_feature_strides.keys()) + self._out_feature_channels = {k: out_channels for k in self._out_features} + self._size_divisibility = strides[-1] + self._square_pad = square_pad + + @property + def padding_constraints(self): + return { + "size_divisiblity": self._size_divisibility, + "square_size": self._square_pad, + } + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + + Returns: + dict[str->Tensor]: + mapping from feature map name to pyramid feature map tensor + in high to low resolution order. Returned feature names follow the FPN + convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. 
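+                For example, with a stride-16 ViT backbone on a 1024x1024 input and
+                ``scale_factors=(4.0, 2.0, 1.0, 0.5)`` plus a one-level top block, the
+                outputs are p2 (256x256), p3 (128x128), p4 (64x64), p5 (32x32) and
+                p6 (16x16), each with ``out_channels`` channels.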
+ """ + bottom_up_features = self.net(x) + features = bottom_up_features[self.in_feature] + results = [] + + for stage in self.stages: + results.append(stage(features)) + + if self.top_block is not None: + if self.top_block.in_feature in bottom_up_features: + top_block_in_feature = bottom_up_features[self.top_block.in_feature] + else: + top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] + results.extend(self.top_block(top_block_in_feature)) + assert len(self._out_features) == len(results) + return {f: res for f, res in zip(self._out_features, results)} + + +def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12): + """ + Calculate lr decay rate for different ViT blocks. + Args: + name (string): parameter name. + lr_decay_rate (float): base lr decay rate. + num_layers (int): number of ViT blocks. + + Returns: + lr decay rate for the given parameter. + """ + layer_id = num_layers + 1 + if name.startswith("backbone"): + if ".pos_embed" in name or ".patch_embed" in name: + layer_id = 0 + elif ".blocks." in name and ".residual." not in name: + layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 + + return lr_decay_rate ** (num_layers + 1 - layer_id) diff --git a/EVA/EVA-02/det/detectron2/modeling/meta_arch/__init__.py b/EVA/EVA-02/det/detectron2/modeling/meta_arch/__init__.py new file mode 100644 index 00000000..0f11c693 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/meta_arch/__init__.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. + +from .build import META_ARCH_REGISTRY, build_model # isort:skip + +from .panoptic_fpn import PanopticFPN + +# import all the meta_arch, so they will be registered +from .rcnn import GeneralizedRCNN, ProposalNetwork +from .dense_detector import DenseDetector +from .retinanet import RetinaNet +from .fcos import FCOS +from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head + +from .cb_rcnn import CBGeneralizedRCNN + +__all__ = list(globals().keys()) diff --git a/EVA/EVA-02/det/detectron2/modeling/meta_arch/build.py b/EVA/EVA-02/det/detectron2/modeling/meta_arch/build.py new file mode 100644 index 00000000..34272157 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/meta_arch/build.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import torch + +from detectron2.utils.logger import _log_api_usage +from detectron2.utils.registry import Registry + +META_ARCH_REGISTRY = Registry("META_ARCH") # noqa F401 isort:skip +META_ARCH_REGISTRY.__doc__ = """ +Registry for meta-architectures, i.e. the whole model. + +The registered object will be called with `obj(cfg)` +and expected to return a `nn.Module` object. +""" + + +def build_model(cfg): + """ + Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``. + Note that it does not load any weights from ``cfg``. + """ + meta_arch = cfg.MODEL.META_ARCHITECTURE + model = META_ARCH_REGISTRY.get(meta_arch)(cfg) + model.to(torch.device(cfg.MODEL.DEVICE)) + _log_api_usage("modeling.meta_arch." + meta_arch) + return model diff --git a/EVA/EVA-02/det/detectron2/modeling/meta_arch/cb_rcnn.py b/EVA/EVA-02/det/detectron2/modeling/meta_arch/cb_rcnn.py new file mode 100644 index 00000000..69c1d4d8 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/meta_arch/cb_rcnn.py @@ -0,0 +1,178 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+import copy +import logging +import warnings + +import numpy as np +from typing import Dict, List, Optional, Tuple +import torch +from torch import nn + +from detectron2.config import configurable +from detectron2.data.detection_utils import convert_image_to_rgb +from detectron2.layers import move_device_like +from detectron2.structures import ImageList, Instances +from detectron2.utils.events import get_event_storage +from detectron2.utils.logger import log_first_n + +from ..backbone import Backbone, build_backbone +from ..postprocessing import detector_postprocess +from ..proposal_generator import build_proposal_generator +from ..roi_heads import build_roi_heads +from .build import META_ARCH_REGISTRY + +import logging +logger = logging.getLogger(__name__) + +from .rcnn import GeneralizedRCNN + + +@META_ARCH_REGISTRY.register() +class CBGeneralizedRCNN(GeneralizedRCNN): + + def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper` . + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + + * image: Tensor, image in (C, H, W) format. + * instances (optional): groundtruth :class:`Instances` + * proposals (optional): :class:`Instances`, precomputed proposals. + + Other information that's included in the original dicts, such as: + + * "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + + Returns: + list[dict]: + Each dict is the output for one input image. + The dict contains one key "instances" whose value is a :class:`Instances`. + The :class:`Instances` object has the following keys: + "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" + """ + if not self.training: + return self.inference(batched_inputs) + + images = self.preprocess_image(batched_inputs) + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + else: + gt_instances = None + + features, cb_features = self.backbone(images.tensor) + + torch.cuda.empty_cache() + + losses = {} + loss = self.inner_forward(images, features, gt_instances, batched_inputs) + # losses.update(loss) + for k, v in loss.items(): + new_k = 'first_' + k + losses[new_k] = v * 0.5 + + torch.cuda.empty_cache() + + cb_loss = self.inner_forward(images, cb_features, gt_instances, batched_inputs) + for k, v in cb_loss.items(): + new_k = 'second_' + k + losses[new_k] = v + + torch.cuda.empty_cache() + + return losses + + def inner_forward(self, images, features, gt_instances, batched_inputs): + + for k, v in features.items(): + # feat_isnan = torch.isnan(v).any() + # feat_isinf = torch.isinf(v).any() + # if feat_isnan or feat_isinf: + # logger.info("============ feat_isnan={}, feat_isinf={} ===============".format(feat_isnan, feat_isinf)) + features[k] = v.float() + + with torch.cuda.amp.autocast(enabled=False): + if self.proposal_generator is not None: + proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) + else: + assert "proposals" in batched_inputs[0] + proposals = [x["proposals"].to(self.device) for x in batched_inputs] + proposal_losses = {} + + _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) + if self.vis_period > 0: + storage = get_event_storage() + if storage.iter % self.vis_period == 0: + self.visualize_training(batched_inputs, proposals) + + losses = {} + losses.update(detector_losses) + 
losses.update(proposal_losses) + return losses + + def inference( + self, + batched_inputs: List[Dict[str, torch.Tensor]], + detected_instances: Optional[List[Instances]] = None, + do_postprocess: bool = True, + keep_all_before_merge: bool = False, + ): + """ + Run inference on the given inputs. + + Args: + batched_inputs (list[dict]): same as in :meth:`forward` + detected_instances (None or list[Instances]): if not None, it + contains an `Instances` object per image. The `Instances` + object contains "pred_boxes" and "pred_classes" which are + known boxes in the image. + The inference will then skip the detection of bounding boxes, + and only predict other per-ROI outputs. + do_postprocess (bool): whether to apply post-processing on the outputs. + keep_all_before_merge + + Returns: + When do_postprocess=True, same as in :meth:`forward`. + Otherwise, a list[Instances] containing raw network outputs. + """ + assert not self.training + + images = self.preprocess_image(batched_inputs) + _, features = self.backbone(images.tensor) + + if detected_instances is None: + if self.proposal_generator is not None: + proposals, _ = self.proposal_generator(images, features, None) + else: + assert "proposals" in batched_inputs[0] + proposals = [x["proposals"].to(self.device) for x in batched_inputs] + + results, _ = self.roi_heads(images, features, proposals, None, keep_all_before_merge=keep_all_before_merge) + else: + detected_instances = [x.to(self.device) for x in detected_instances] + results = self.roi_heads.forward_with_given_boxes(features, detected_instances) + # optionally update score using maskness + if self.roi_heads.maskness_thresh is not None: + for pred_inst in results: + # pred_inst._fields.keys(): dict_keys(['pred_boxes', 'scores', 'pred_classes', 'pred_masks']) + pred_masks = pred_inst.pred_masks # (num_inst, 1, 28, 28) + scores = pred_inst.scores # (num_inst, ) + # sigmoid already applied + binary_masks = pred_masks > self.roi_heads.maskness_thresh + seg_scores = (pred_masks * binary_masks.float()).sum((1, 2, 3)) / binary_masks.sum((1, 2, 3)) + seg_scores[binary_masks.sum((1, 2, 3)) == 0] = 0 # avoid nan + updated_scores = scores * seg_scores + pred_inst.set('scores', updated_scores) + # update order + scores, indices = updated_scores.sort(descending=True) + pred_inst = pred_inst[indices] + assert (pred_inst.scores == scores).all() + + if do_postprocess: + assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess." 
+ return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes) + return results + + \ No newline at end of file diff --git a/EVA/EVA-02/det/detectron2/modeling/meta_arch/dense_detector.py b/EVA/EVA-02/det/detectron2/modeling/meta_arch/dense_detector.py new file mode 100644 index 00000000..9d9b9e8d --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/meta_arch/dense_detector.py @@ -0,0 +1,289 @@ +import numpy as np +from typing import Dict, List, Optional, Tuple +import torch +from torch import Tensor, nn + +from detectron2.data.detection_utils import convert_image_to_rgb +from detectron2.layers import move_device_like +from detectron2.modeling import Backbone +from detectron2.structures import Boxes, ImageList, Instances +from detectron2.utils.events import get_event_storage + +from ..postprocessing import detector_postprocess + + +def permute_to_N_HWA_K(tensor, K: int): + """ + Transpose/reshape a tensor from (N, (Ai x K), H, W) to (N, (HxWxAi), K) + """ + assert tensor.dim() == 4, tensor.shape + N, _, H, W = tensor.shape + tensor = tensor.view(N, -1, K, H, W) + tensor = tensor.permute(0, 3, 4, 1, 2) + tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K) + return tensor + + +class DenseDetector(nn.Module): + """ + Base class for dense detector. We define a dense detector as a fully-convolutional model that + makes per-pixel (i.e. dense) predictions. + """ + + def __init__( + self, + backbone: Backbone, + head: nn.Module, + head_in_features: Optional[List[str]] = None, + *, + pixel_mean, + pixel_std, + ): + """ + Args: + backbone: backbone module + head: head module + head_in_features: backbone features to use in head. Default to all backbone features. + pixel_mean (Tuple[float]): + Values to be used for image normalization (BGR order). + To train on images of different number of channels, set different mean & std. + Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675] + pixel_std (Tuple[float]): + When using pre-trained models in Detectron1 or any MSRA models, + std has been absorbed into its conv1 weights, so the std needs to be set 1. + Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std) + """ + super().__init__() + + self.backbone = backbone + self.head = head + if head_in_features is None: + shapes = self.backbone.output_shape() + self.head_in_features = sorted(shapes.keys(), key=lambda x: shapes[x].stride) + else: + self.head_in_features = head_in_features + self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) + + @property + def device(self): + return self.pixel_mean.device + + def _move_to_current_device(self, x): + return move_device_like(x, self.pixel_mean) + + def forward(self, batched_inputs: List[Dict[str, Tensor]]): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper` . + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + + * image: Tensor, image in (C, H, W) format. + * instances: Instances + + Other information that's included in the original dicts, such as: + + * "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + + Returns: + In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the + loss. Used during training only. In inference, the standard output format, described + in :doc:`/tutorials/models`. 
+ """ + images = self.preprocess_image(batched_inputs) + features = self.backbone(images.tensor) + features = [features[f] for f in self.head_in_features] + predictions = self.head(features) + + if self.training: + assert not torch.jit.is_scripting(), "Not supported" + assert "instances" in batched_inputs[0], "Instance annotations are missing in training!" + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + return self.forward_training(images, features, predictions, gt_instances) + else: + results = self.forward_inference(images, features, predictions) + if torch.jit.is_scripting(): + return results + + processed_results = [] + for results_per_image, input_per_image, image_size in zip( + results, batched_inputs, images.image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"instances": r}) + return processed_results + + def forward_training(self, images, features, predictions, gt_instances): + raise NotImplementedError() + + def preprocess_image(self, batched_inputs: List[Dict[str, Tensor]]): + """ + Normalize, pad and batch the input images. + """ + images = [self._move_to_current_device(x["image"]) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors( + images, + self.backbone.size_divisibility, + padding_constraints=self.backbone.padding_constraints, + ) + return images + + def _transpose_dense_predictions( + self, predictions: List[List[Tensor]], dims_per_anchor: List[int] + ) -> List[List[Tensor]]: + """ + Transpose the dense per-level predictions. + + Args: + predictions: a list of outputs, each is a list of per-level + predictions with shape (N, Ai x K, Hi, Wi), where N is the + number of images, Ai is the number of anchors per location on + level i, K is the dimension of predictions per anchor. + dims_per_anchor: the value of K for each predictions. e.g. 4 for + box prediction, #classes for classification prediction. + + Returns: + List[List[Tensor]]: each prediction is transposed to (N, Hi x Wi x Ai, K). + """ + assert len(predictions) == len(dims_per_anchor) + res: List[List[Tensor]] = [] + for pred, dim_per_anchor in zip(predictions, dims_per_anchor): + pred = [permute_to_N_HWA_K(x, dim_per_anchor) for x in pred] + res.append(pred) + return res + + def _ema_update(self, name: str, value: float, initial_value: float, momentum: float = 0.9): + """ + Apply EMA update to `self.name` using `value`. + + This is mainly used for loss normalizer. In Detectron1, loss is normalized by number + of foreground samples in the batch. When batch size is 1 per GPU, #foreground has a + large variance and using it lead to lower performance. Therefore we maintain an EMA of + #foreground to stabilize the normalizer. 
+ + Args: + name: name of the normalizer + value: the new value to update + initial_value: the initial value to start with + momentum: momentum of EMA + + Returns: + float: the updated EMA value + """ + if hasattr(self, name): + old = getattr(self, name) + else: + old = initial_value + new = old * momentum + value * (1 - momentum) + setattr(self, name, new) + return new + + def _decode_per_level_predictions( + self, + anchors: Boxes, + pred_scores: Tensor, + pred_deltas: Tensor, + score_thresh: float, + topk_candidates: int, + image_size: Tuple[int, int], + ) -> Instances: + """ + Decode boxes and classification predictions of one featuer level, by + the following steps: + 1. filter the predictions based on score threshold and top K scores. + 2. transform the box regression outputs + 3. return the predicted scores, classes and boxes + + Args: + anchors: Boxes, anchor for this feature level + pred_scores: HxWxA,K + pred_deltas: HxWxA,4 + + Returns: + Instances: with field "scores", "pred_boxes", "pred_classes". + """ + # Apply two filtering to make NMS faster. + # 1. Keep boxes with confidence score higher than threshold + keep_idxs = pred_scores > score_thresh + pred_scores = pred_scores[keep_idxs] + topk_idxs = torch.nonzero(keep_idxs) # Kx2 + + # 2. Keep top k top scoring boxes only + num_topk = min(topk_candidates, topk_idxs.size(0)) + pred_scores, idxs = pred_scores.topk(num_topk) + topk_idxs = topk_idxs[idxs] + + anchor_idxs, classes_idxs = topk_idxs.unbind(dim=1) + + pred_boxes = self.box2box_transform.apply_deltas( + pred_deltas[anchor_idxs], anchors.tensor[anchor_idxs] + ) + return Instances( + image_size, pred_boxes=Boxes(pred_boxes), scores=pred_scores, pred_classes=classes_idxs + ) + + def _decode_multi_level_predictions( + self, + anchors: List[Boxes], + pred_scores: List[Tensor], + pred_deltas: List[Tensor], + score_thresh: float, + topk_candidates: int, + image_size: Tuple[int, int], + ) -> Instances: + """ + Run `_decode_per_level_predictions` for all feature levels and concat the results. + """ + predictions = [ + self._decode_per_level_predictions( + anchors_i, + box_cls_i, + box_reg_i, + self.test_score_thresh, + self.test_topk_candidates, + image_size, + ) + # Iterate over every feature level + for box_cls_i, box_reg_i, anchors_i in zip(pred_scores, pred_deltas, anchors) + ] + return predictions[0].cat(predictions) # 'Instances.cat' is not scriptale but this is + + def visualize_training(self, batched_inputs, results): + """ + A function used to visualize ground truth images and final network predictions. + It shows ground truth bounding boxes on the original image and up to 20 + predicted object bounding boxes on the original image. + + Args: + batched_inputs (list): a list that contains input to the model. + results (List[Instances]): a list of #images elements returned by forward_inference(). 
+ """ + from detectron2.utils.visualizer import Visualizer + + assert len(batched_inputs) == len( + results + ), "Cannot visualize inputs and results of different sizes" + storage = get_event_storage() + max_boxes = 20 + + image_index = 0 # only visualize a single image + img = batched_inputs[image_index]["image"] + img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format) + v_gt = Visualizer(img, None) + v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes) + anno_img = v_gt.get_image() + processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1]) + predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy() + + v_pred = Visualizer(img, None) + v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes]) + prop_img = v_pred.get_image() + vis_img = np.vstack((anno_img, prop_img)) + vis_img = vis_img.transpose(2, 0, 1) + vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results" + storage.put_image(vis_name, vis_img) diff --git a/EVA/EVA-02/det/detectron2/modeling/meta_arch/fcos.py b/EVA/EVA-02/det/detectron2/modeling/meta_arch/fcos.py new file mode 100644 index 00000000..7e7140bf --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/meta_arch/fcos.py @@ -0,0 +1,328 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +import logging +from typing import List, Optional, Tuple +import torch +from fvcore.nn import sigmoid_focal_loss_jit +from torch import nn +from torch.nn import functional as F + +from detectron2.layers import ShapeSpec, batched_nms +from detectron2.structures import Boxes, ImageList, Instances, pairwise_point_box_distance +from detectron2.utils.events import get_event_storage + +from ..anchor_generator import DefaultAnchorGenerator +from ..backbone import Backbone +from ..box_regression import Box2BoxTransformLinear, _dense_box_regression_loss +from .dense_detector import DenseDetector +from .retinanet import RetinaNetHead + +__all__ = ["FCOS"] + +logger = logging.getLogger(__name__) + + +class FCOS(DenseDetector): + """ + Implement FCOS in :paper:`fcos`. + """ + + def __init__( + self, + *, + backbone: Backbone, + head: nn.Module, + head_in_features: Optional[List[str]] = None, + box2box_transform=None, + num_classes, + center_sampling_radius: float = 1.5, + focal_loss_alpha=0.25, + focal_loss_gamma=2.0, + test_score_thresh=0.2, + test_topk_candidates=1000, + test_nms_thresh=0.6, + max_detections_per_image=100, + pixel_mean, + pixel_std, + ): + """ + Args: + center_sampling_radius: radius of the "center" of a groundtruth box, + within which all anchor points are labeled positive. + Other arguments mean the same as in :class:`RetinaNet`. + """ + super().__init__( + backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std + ) + + self.num_classes = num_classes + + # FCOS uses one anchor point per location. + # We represent the anchor point by a box whose size equals the anchor stride. + feature_shapes = backbone.output_shape() + fpn_strides = [feature_shapes[k].stride for k in self.head_in_features] + self.anchor_generator = DefaultAnchorGenerator( + sizes=[[k] for k in fpn_strides], aspect_ratios=[1.0], strides=fpn_strides + ) + + # FCOS parameterizes box regression by a linear transform, + # where predictions are normalized by anchor stride (equal to anchor size). 
+ if box2box_transform is None: + box2box_transform = Box2BoxTransformLinear(normalize_by_size=True) + self.box2box_transform = box2box_transform + + self.center_sampling_radius = float(center_sampling_radius) + + # Loss parameters: + self.focal_loss_alpha = focal_loss_alpha + self.focal_loss_gamma = focal_loss_gamma + + # Inference parameters: + self.test_score_thresh = test_score_thresh + self.test_topk_candidates = test_topk_candidates + self.test_nms_thresh = test_nms_thresh + self.max_detections_per_image = max_detections_per_image + + def forward_training(self, images, features, predictions, gt_instances): + # Transpose the Hi*Wi*A dimension to the middle: + pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions( + predictions, [self.num_classes, 4, 1] + ) + anchors = self.anchor_generator(features) + gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances) + return self.losses( + anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness + ) + + @torch.no_grad() + def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]): + """ + Match ground-truth boxes to a set of multi-level anchors. + + Args: + gt_boxes: Ground-truth boxes from instances of an image. + anchors: List of anchors for each feature map (of different scales). + + Returns: + torch.Tensor + A tensor of shape `(M, R)`, given `M` ground-truth boxes and total + `R` anchor points from all feature levels, indicating the quality + of match between m-th box and r-th anchor. Higher value indicates + better match. + """ + # Naming convention: (M = ground-truth boxes, R = anchor points) + # Anchor points are represented as square boxes of size = stride. + num_anchors_per_level = [len(x) for x in anchors] + anchors = Boxes.cat(anchors) # (R, 4) + anchor_centers = anchors.get_centers() # (R, 2) + anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0] # (R, ) + + lower_bound = anchor_sizes * 4 + lower_bound[: num_anchors_per_level[0]] = 0 + upper_bound = anchor_sizes * 8 + upper_bound[-num_anchors_per_level[-1] :] = float("inf") + + gt_centers = gt_boxes.get_centers() + + # FCOS with center sampling: anchor point must be close enough to + # ground-truth box center. + center_dists = (anchor_centers[None, :, :] - gt_centers[:, None, :]).abs_() + sampling_regions = self.center_sampling_radius * anchor_sizes[None, :] + + match_quality_matrix = center_dists.max(dim=2).values < sampling_regions + + pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes) + pairwise_dist = pairwise_dist.permute(1, 0, 2) # (M, R, 4) + + # The original FCOS anchor matching rule: anchor point must be inside GT. + match_quality_matrix &= pairwise_dist.min(dim=2).values > 0 + + # Multilevel anchor matching in FCOS: each anchor is only responsible + # for certain scale range. + pairwise_dist = pairwise_dist.max(dim=2).values + match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & ( + pairwise_dist < upper_bound[None, :] + ) + # Match the GT box with minimum area, if there are multiple GT matches. + gt_areas = gt_boxes.area() # (M, ) + + match_quality_matrix = match_quality_matrix.to(torch.float32) + match_quality_matrix *= 1e8 - gt_areas[:, None] + return match_quality_matrix # (M, R) + + @torch.no_grad() + def label_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]): + """ + Same interface as :meth:`RetinaNet.label_anchors`, but implemented with FCOS + anchor matching rule. + + Unlike RetinaNet, there are no ignored anchors. 
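+
+        Returns:
+            list[Tensor]: per-image anchor labels in ``[0, num_classes]``, where the
+                value ``num_classes`` marks background anchors.
+            list[Tensor]: per-image matched ground-truth boxes as Rx4 tensors.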
+ """ + + gt_labels, matched_gt_boxes = [], [] + + for inst in gt_instances: + if len(inst) > 0: + match_quality_matrix = self._match_anchors(inst.gt_boxes, anchors) + + # Find matched ground-truth box per anchor. Un-matched anchors are + # assigned -1. This is equivalent to using an anchor matcher as used + # in R-CNN/RetinaNet: `Matcher(thresholds=[1e-5], labels=[0, 1])` + match_quality, matched_idxs = match_quality_matrix.max(dim=0) + matched_idxs[match_quality < 1e-5] = -1 + + matched_gt_boxes_i = inst.gt_boxes.tensor[matched_idxs.clip(min=0)] + gt_labels_i = inst.gt_classes[matched_idxs.clip(min=0)] + + # Anchors with matched_idxs = -1 are labeled background. + gt_labels_i[matched_idxs < 0] = self.num_classes + else: + matched_gt_boxes_i = torch.zeros_like(Boxes.cat(anchors).tensor) + gt_labels_i = torch.full( + (len(matched_gt_boxes_i),), + fill_value=self.num_classes, + dtype=torch.long, + device=matched_gt_boxes_i.device, + ) + + gt_labels.append(gt_labels_i) + matched_gt_boxes.append(matched_gt_boxes_i) + + return gt_labels, matched_gt_boxes + + def losses( + self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness + ): + """ + This method is almost identical to :meth:`RetinaNet.losses`, with an extra + "loss_centerness" in the returned dict. + """ + num_images = len(gt_labels) + gt_labels = torch.stack(gt_labels) # (M, R) + + pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes) + num_pos_anchors = pos_mask.sum().item() + get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images) + normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 300) + + # classification and regression loss + gt_labels_target = F.one_hot(gt_labels, num_classes=self.num_classes + 1)[ + :, :, :-1 + ] # no loss for the last (background) class + loss_cls = sigmoid_focal_loss_jit( + torch.cat(pred_logits, dim=1), + gt_labels_target.to(pred_logits[0].dtype), + alpha=self.focal_loss_alpha, + gamma=self.focal_loss_gamma, + reduction="sum", + ) + + loss_box_reg = _dense_box_regression_loss( + anchors, + self.box2box_transform, + pred_anchor_deltas, + gt_boxes, + pos_mask, + box_reg_loss_type="giou", + ) + + ctrness_targets = self.compute_ctrness_targets(anchors, gt_boxes) # (M, R) + pred_centerness = torch.cat(pred_centerness, dim=1).squeeze(dim=2) # (M, R) + ctrness_loss = F.binary_cross_entropy_with_logits( + pred_centerness[pos_mask], ctrness_targets[pos_mask], reduction="sum" + ) + return { + "loss_fcos_cls": loss_cls / normalizer, + "loss_fcos_loc": loss_box_reg / normalizer, + "loss_fcos_ctr": ctrness_loss / normalizer, + } + + def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]): + anchors = Boxes.cat(anchors).tensor # Rx4 + reg_targets = [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes] + reg_targets = torch.stack(reg_targets, dim=0) # NxRx4 + if len(reg_targets) == 0: + return reg_targets.new_zeros(len(reg_targets)) + left_right = reg_targets[:, :, [0, 2]] + top_bottom = reg_targets[:, :, [1, 3]] + ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * ( + top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0] + ) + return torch.sqrt(ctrness) + + def forward_inference( + self, + images: ImageList, + features: List[torch.Tensor], + predictions: List[List[torch.Tensor]], + ): + pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions( + predictions, [self.num_classes, 4, 1] + ) + anchors = self.anchor_generator(features) + + results: 
List[Instances] = [] + for img_idx, image_size in enumerate(images.image_sizes): + scores_per_image = [ + # Multiply and sqrt centerness & classification scores + # (See eqn. 4 in https://arxiv.org/abs/2006.09214) + torch.sqrt(x[img_idx].sigmoid_() * y[img_idx].sigmoid_()) + for x, y in zip(pred_logits, pred_centerness) + ] + deltas_per_image = [x[img_idx] for x in pred_anchor_deltas] + results_per_image = self.inference_single_image( + anchors, scores_per_image, deltas_per_image, image_size + ) + results.append(results_per_image) + return results + + def inference_single_image( + self, + anchors: List[Boxes], + box_cls: List[torch.Tensor], + box_delta: List[torch.Tensor], + image_size: Tuple[int, int], + ): + """ + Identical to :meth:`RetinaNet.inference_single_image. + """ + pred = self._decode_multi_level_predictions( + anchors, + box_cls, + box_delta, + self.test_score_thresh, + self.test_topk_candidates, + image_size, + ) + keep = batched_nms( + pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh + ) + return pred[keep[: self.max_detections_per_image]] + + +class FCOSHead(RetinaNetHead): + """ + The head used in :paper:`fcos`. It adds an additional centerness + prediction branch on top of :class:`RetinaNetHead`. + """ + + def __init__(self, *, input_shape: List[ShapeSpec], conv_dims: List[int], **kwargs): + super().__init__(input_shape=input_shape, conv_dims=conv_dims, num_anchors=1, **kwargs) + # Unlike original FCOS, we do not add an additional learnable scale layer + # because it's found to have no benefits after normalizing regression targets by stride. + self._num_features = len(input_shape) + self.ctrness = nn.Conv2d(conv_dims[-1], 1, kernel_size=3, stride=1, padding=1) + torch.nn.init.normal_(self.ctrness.weight, std=0.01) + torch.nn.init.constant_(self.ctrness.bias, 0) + + def forward(self, features): + assert len(features) == self._num_features + logits = [] + bbox_reg = [] + ctrness = [] + for feature in features: + logits.append(self.cls_score(self.cls_subnet(feature))) + bbox_feature = self.bbox_subnet(feature) + bbox_reg.append(self.bbox_pred(bbox_feature)) + ctrness.append(self.ctrness(bbox_feature)) + return logits, bbox_reg, ctrness diff --git a/EVA/EVA-02/det/detectron2/modeling/meta_arch/panoptic_fpn.py b/EVA/EVA-02/det/detectron2/modeling/meta_arch/panoptic_fpn.py new file mode 100644 index 00000000..b31e1c8d --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/meta_arch/panoptic_fpn.py @@ -0,0 +1,269 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. + +import logging +from typing import Dict, List +import torch +from torch import nn + +from detectron2.config import configurable +from detectron2.structures import ImageList + +from ..postprocessing import detector_postprocess, sem_seg_postprocess +from .build import META_ARCH_REGISTRY +from .rcnn import GeneralizedRCNN +from .semantic_seg import build_sem_seg_head + +__all__ = ["PanopticFPN"] + + +@META_ARCH_REGISTRY.register() +class PanopticFPN(GeneralizedRCNN): + """ + Implement the paper :paper:`PanopticFPN`. + """ + + @configurable + def __init__( + self, + *, + sem_seg_head: nn.Module, + combine_overlap_thresh: float = 0.5, + combine_stuff_area_thresh: float = 4096, + combine_instances_score_thresh: float = 0.5, + **kwargs, + ): + """ + NOTE: this interface is experimental. + + Args: + sem_seg_head: a module for the semantic segmentation head. 
+ combine_overlap_thresh: combine masks into one instances if + they have enough overlap + combine_stuff_area_thresh: ignore stuff areas smaller than this threshold + combine_instances_score_thresh: ignore instances whose score is + smaller than this threshold + + Other arguments are the same as :class:`GeneralizedRCNN`. + """ + super().__init__(**kwargs) + self.sem_seg_head = sem_seg_head + # options when combining instance & semantic outputs + self.combine_overlap_thresh = combine_overlap_thresh + self.combine_stuff_area_thresh = combine_stuff_area_thresh + self.combine_instances_score_thresh = combine_instances_score_thresh + + @classmethod + def from_config(cls, cfg): + ret = super().from_config(cfg) + ret.update( + { + "combine_overlap_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH, + "combine_stuff_area_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT, + "combine_instances_score_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH, # noqa + } + ) + ret["sem_seg_head"] = build_sem_seg_head(cfg, ret["backbone"].output_shape()) + logger = logging.getLogger(__name__) + if not cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED: + logger.warning( + "PANOPTIC_FPN.COMBINED.ENABLED is no longer used. " + " model.inference(do_postprocess=) should be used to toggle postprocessing." + ) + if cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT != 1.0: + w = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT + logger.warning( + "PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT should be replaced by weights on each ROI head." + ) + + def update_weight(x): + if isinstance(x, dict): + return {k: v * w for k, v in x.items()} + else: + return x * w + + roi_heads = ret["roi_heads"] + roi_heads.box_predictor.loss_weight = update_weight(roi_heads.box_predictor.loss_weight) + roi_heads.mask_head.loss_weight = update_weight(roi_heads.mask_head.loss_weight) + return ret + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper`. + Each item in the list contains the inputs for one image. + + For now, each item in the list is a dict that contains: + + * "image": Tensor, image in (C, H, W) format. + * "instances": Instances + * "sem_seg": semantic segmentation ground truth. + * Other information that's included in the original dicts, such as: + "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + + Returns: + list[dict]: + each dict has the results for one image. The dict contains the following keys: + + * "instances": see :meth:`GeneralizedRCNN.forward` for its format. + * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format. + * "panoptic_seg": See the return value of + :func:`combine_semantic_and_instance_outputs` for its format. 
+ """ + if not self.training: + return self.inference(batched_inputs) + images = self.preprocess_image(batched_inputs) + features = self.backbone(images.tensor) + + assert "sem_seg" in batched_inputs[0] + gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs] + gt_sem_seg = ImageList.from_tensors( + gt_sem_seg, + self.backbone.size_divisibility, + self.sem_seg_head.ignore_value, + self.backbone.padding_constraints, + ).tensor + sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg) + + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) + detector_results, detector_losses = self.roi_heads( + images, features, proposals, gt_instances + ) + + losses = sem_seg_losses + losses.update(proposal_losses) + losses.update(detector_losses) + return losses + + def inference(self, batched_inputs: List[Dict[str, torch.Tensor]], do_postprocess: bool = True): + """ + Run inference on the given inputs. + + Args: + batched_inputs (list[dict]): same as in :meth:`forward` + do_postprocess (bool): whether to apply post-processing on the outputs. + + Returns: + When do_postprocess=True, see docs in :meth:`forward`. + Otherwise, returns a (list[Instances], list[Tensor]) that contains + the raw detector outputs, and raw semantic segmentation outputs. + """ + images = self.preprocess_image(batched_inputs) + features = self.backbone(images.tensor) + sem_seg_results, sem_seg_losses = self.sem_seg_head(features, None) + proposals, _ = self.proposal_generator(images, features, None) + detector_results, _ = self.roi_heads(images, features, proposals, None) + + if do_postprocess: + processed_results = [] + for sem_seg_result, detector_result, input_per_image, image_size in zip( + sem_seg_results, detector_results, batched_inputs, images.image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width) + detector_r = detector_postprocess(detector_result, height, width) + + processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r}) + + panoptic_r = combine_semantic_and_instance_outputs( + detector_r, + sem_seg_r.argmax(dim=0), + self.combine_overlap_thresh, + self.combine_stuff_area_thresh, + self.combine_instances_score_thresh, + ) + processed_results[-1]["panoptic_seg"] = panoptic_r + return processed_results + else: + return detector_results, sem_seg_results + + +def combine_semantic_and_instance_outputs( + instance_results, + semantic_results, + overlap_threshold, + stuff_area_thresh, + instances_score_thresh, +): + """ + Implement a simple combining logic following + "combine_semantic_and_instance_predictions.py" in panopticapi + to produce panoptic segmentation outputs. + + Args: + instance_results: output of :func:`detector_postprocess`. + semantic_results: an (H, W) tensor, each element is the contiguous semantic + category id + + Returns: + panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. + segments_info (list[dict]): Describe each segment in `panoptic_seg`. + Each dict contains keys "id", "category_id", "isthing". 
+ """ + panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32) + + # sort instance outputs by scores + sorted_inds = torch.argsort(-instance_results.scores) + + current_segment_id = 0 + segments_info = [] + + instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device) + + # Add instances one-by-one, check for overlaps with existing ones + for inst_id in sorted_inds: + score = instance_results.scores[inst_id].item() + if score < instances_score_thresh: + break + mask = instance_masks[inst_id] # H,W + mask_area = mask.sum().item() + + if mask_area == 0: + continue + + intersect = (mask > 0) & (panoptic_seg > 0) + intersect_area = intersect.sum().item() + + if intersect_area * 1.0 / mask_area > overlap_threshold: + continue + + if intersect_area > 0: + mask = mask & (panoptic_seg == 0) + + current_segment_id += 1 + panoptic_seg[mask] = current_segment_id + segments_info.append( + { + "id": current_segment_id, + "isthing": True, + "score": score, + "category_id": instance_results.pred_classes[inst_id].item(), + "instance_id": inst_id.item(), + } + ) + + # Add semantic results to remaining empty areas + semantic_labels = torch.unique(semantic_results).cpu().tolist() + for semantic_label in semantic_labels: + if semantic_label == 0: # 0 is a special "thing" class + continue + mask = (semantic_results == semantic_label) & (panoptic_seg == 0) + mask_area = mask.sum().item() + if mask_area < stuff_area_thresh: + continue + + current_segment_id += 1 + panoptic_seg[mask] = current_segment_id + segments_info.append( + { + "id": current_segment_id, + "isthing": False, + "category_id": semantic_label, + "area": mask_area, + } + ) + + return panoptic_seg, segments_info diff --git a/EVA/EVA-02/det/detectron2/modeling/meta_arch/rcnn.py b/EVA/EVA-02/det/detectron2/modeling/meta_arch/rcnn.py new file mode 100644 index 00000000..aa4774b2 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/meta_arch/rcnn.py @@ -0,0 +1,389 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import copy +import logging +import warnings + +import numpy as np +from typing import Dict, List, Optional, Tuple +import torch +from torch import nn + +from detectron2.config import configurable +from detectron2.data.detection_utils import convert_image_to_rgb +from detectron2.layers import move_device_like +from detectron2.structures import ImageList, Instances +from detectron2.utils.events import get_event_storage +from detectron2.utils.logger import log_first_n + +from ..backbone import Backbone, build_backbone +from ..postprocessing import detector_postprocess +from ..proposal_generator import build_proposal_generator +from ..roi_heads import build_roi_heads +from .build import META_ARCH_REGISTRY + +import logging +logger = logging.getLogger(__name__) + + +__all__ = ["GeneralizedRCNN", "ProposalNetwork"] + + +@META_ARCH_REGISTRY.register() +class GeneralizedRCNN(nn.Module): + """ + Generalized R-CNN. Any models that contains the following three components: + 1. Per-image feature extraction (aka backbone) + 2. Region proposal generation + 3. 
Per-region feature extraction and prediction + """ + + @configurable + def __init__( + self, + *, + backbone: Backbone, + proposal_generator: nn.Module, + roi_heads: nn.Module, + pixel_mean: Tuple[float], + pixel_std: Tuple[float], + input_format: Optional[str] = None, + vis_period: int = 0, + ): + """ + Args: + backbone: a backbone module, must follow detectron2's backbone interface + proposal_generator: a module that generates proposals using backbone features + roi_heads: a ROI head that performs per-region computation + pixel_mean, pixel_std: list or tuple with #channels element, representing + the per-channel mean and std to be used to normalize the input image + input_format: describe the meaning of channels of input. Needed by visualization + vis_period: the period to run visualization. Set to 0 to disable. + """ + super().__init__() + self.backbone = backbone + self.proposal_generator = proposal_generator + self.roi_heads = roi_heads + + self.input_format = input_format + self.vis_period = vis_period + if vis_period > 0: + assert input_format is not None, "input_format is required for visualization!" + + self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) + assert ( + self.pixel_mean.shape == self.pixel_std.shape + ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!" + + @classmethod + def from_config(cls, cfg): + backbone = build_backbone(cfg) + return { + "backbone": backbone, + "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), + "roi_heads": build_roi_heads(cfg, backbone.output_shape()), + "input_format": cfg.INPUT.FORMAT, + "vis_period": cfg.VIS_PERIOD, + "pixel_mean": cfg.MODEL.PIXEL_MEAN, + "pixel_std": cfg.MODEL.PIXEL_STD, + } + + @property + def device(self): + return self.pixel_mean.device + + def _move_to_current_device(self, x): + return move_device_like(x, self.pixel_mean) + + def visualize_training(self, batched_inputs, proposals): + """ + A function used to visualize images and proposals. It shows ground truth + bounding boxes on the original image and up to 20 top-scoring predicted + object proposals on the original image. Users can implement different + visualization functions for different models. + + Args: + batched_inputs (list): a list that contains input to the model. + proposals (list): a list that contains predicted proposals. Both + batched_inputs and proposals should have the same length. + """ + from detectron2.utils.visualizer import Visualizer + + storage = get_event_storage() + max_vis_prop = 20 + + for input, prop in zip(batched_inputs, proposals): + img = input["image"] + img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format) + v_gt = Visualizer(img, None) + v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes) + anno_img = v_gt.get_image() + box_size = min(len(prop.proposal_boxes), max_vis_prop) + v_pred = Visualizer(img, None) + v_pred = v_pred.overlay_instances( + boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy() + ) + prop_img = v_pred.get_image() + vis_img = np.concatenate((anno_img, prop_img), axis=1) + vis_img = vis_img.transpose(2, 0, 1) + vis_name = "Left: GT bounding boxes; Right: Predicted proposals" + storage.put_image(vis_name, vis_img) + break # only visualize one image in a batch + + def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper` . 
+ Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + + * image: Tensor, image in (C, H, W) format. + * instances (optional): groundtruth :class:`Instances` + * proposals (optional): :class:`Instances`, precomputed proposals. + + Other information that's included in the original dicts, such as: + + * "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + + Returns: + list[dict]: + Each dict is the output for one input image. + The dict contains one key "instances" whose value is a :class:`Instances`. + The :class:`Instances` object has the following keys: + "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" + """ + if not self.training: + return self.inference(batched_inputs) + + images = self.preprocess_image(batched_inputs) + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + else: + gt_instances = None + + features = self.backbone(images.tensor) + + for k, v in features.items(): + # feat_isnan = torch.isnan(v).any() + # feat_isinf = torch.isinf(v).any() + # if feat_isnan or feat_isinf: + # logger.info("============ feat_isnan={}, feat_isinf={} ===============".format(feat_isnan, feat_isinf)) + features[k] = v.float() + + with torch.cuda.amp.autocast(enabled=False): + if self.proposal_generator is not None: + proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) + else: + assert "proposals" in batched_inputs[0] + proposals = [x["proposals"].to(self.device) for x in batched_inputs] + proposal_losses = {} + + _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) + if self.vis_period > 0: + storage = get_event_storage() + if storage.iter % self.vis_period == 0: + self.visualize_training(batched_inputs, proposals) + + losses = {} + losses.update(detector_losses) + losses.update(proposal_losses) + return losses + + def inference( + self, + batched_inputs: List[Dict[str, torch.Tensor]], + detected_instances: Optional[List[Instances]] = None, + do_postprocess: bool = True, + keep_all_before_merge: bool = False, + ): + """ + Run inference on the given inputs. + + Args: + batched_inputs (list[dict]): same as in :meth:`forward` + detected_instances (None or list[Instances]): if not None, it + contains an `Instances` object per image. The `Instances` + object contains "pred_boxes" and "pred_classes" which are + known boxes in the image. + The inference will then skip the detection of bounding boxes, + and only predict other per-ROI outputs. + do_postprocess (bool): whether to apply post-processing on the outputs. + keep_all_before_merge + + Returns: + When do_postprocess=True, same as in :meth:`forward`. + Otherwise, a list[Instances] containing raw network outputs. 
+ """ + assert not self.training + + images = self.preprocess_image(batched_inputs) + features = self.backbone(images.tensor) + + if detected_instances is None: + if self.proposal_generator is not None: + proposals, _ = self.proposal_generator(images, features, None) + else: + assert "proposals" in batched_inputs[0] + proposals = [x["proposals"].to(self.device) for x in batched_inputs] + + results, _ = self.roi_heads(images, features, proposals, None, keep_all_before_merge=keep_all_before_merge) + else: + detected_instances = [x.to(self.device) for x in detected_instances] + results = self.roi_heads.forward_with_given_boxes(features, detected_instances) + # optionally update score using maskness + if self.roi_heads.maskness_thresh is not None: + for pred_inst in results: + # pred_inst._fields.keys(): dict_keys(['pred_boxes', 'scores', 'pred_classes', 'pred_masks']) + pred_masks = pred_inst.pred_masks # (num_inst, 1, 28, 28) + scores = pred_inst.scores # (num_inst, ) + # sigmoid already applied + binary_masks = pred_masks > self.roi_heads.maskness_thresh + seg_scores = (pred_masks * binary_masks.float()).sum((1, 2, 3)) / binary_masks.sum((1, 2, 3)) + seg_scores[binary_masks.sum((1, 2, 3)) == 0] = 0 # avoid nan + updated_scores = scores * seg_scores + pred_inst.set('scores', updated_scores) + # update order + scores, indices = updated_scores.sort(descending=True) + pred_inst = pred_inst[indices] + assert (pred_inst.scores == scores).all() + + if do_postprocess: + assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess." + return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes) + return results + + def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]): + """ + Normalize, pad and batch the input images. + """ + images = [self._move_to_current_device(x["image"]) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + + # TODO: modify square_size when necessary to avoid negative padding + max_size = 0 + for img in images: + _, h, w = img.shape + if max(h, w) > max_size: + max_size = max(h, w) + padding_constraints = copy.deepcopy(self.backbone.padding_constraints) + if 'square_size' in self.backbone.padding_constraints: + square_size = self.backbone.padding_constraints['square_size'] + if square_size < max_size and square_size != 0: + warnings.warn("square_size={}, is smaller than max_size={} in batch".format( + self.backbone.padding_constraints['square_size'], max_size)) + padding_constraints['square_size'] = max_size + + images = ImageList.from_tensors( + images, + self.backbone.size_divisibility, + padding_constraints=padding_constraints, + ) + return images + + @staticmethod + def _postprocess(instances, batched_inputs: List[Dict[str, torch.Tensor]], image_sizes): + """ + Rescale the output instances to the target size. + """ + # note: private function; subject to changes + processed_results = [] + for results_per_image, input_per_image, image_size in zip( + instances, batched_inputs, image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"instances": r}) + return processed_results + + +@META_ARCH_REGISTRY.register() +class ProposalNetwork(nn.Module): + """ + A meta architecture that only predicts object proposals. 
+ """ + + @configurable + def __init__( + self, + *, + backbone: Backbone, + proposal_generator: nn.Module, + pixel_mean: Tuple[float], + pixel_std: Tuple[float], + ): + """ + Args: + backbone: a backbone module, must follow detectron2's backbone interface + proposal_generator: a module that generates proposals using backbone features + pixel_mean, pixel_std: list or tuple with #channels element, representing + the per-channel mean and std to be used to normalize the input image + """ + super().__init__() + self.backbone = backbone + self.proposal_generator = proposal_generator + self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) + + @classmethod + def from_config(cls, cfg): + backbone = build_backbone(cfg) + return { + "backbone": backbone, + "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), + "pixel_mean": cfg.MODEL.PIXEL_MEAN, + "pixel_std": cfg.MODEL.PIXEL_STD, + } + + @property + def device(self): + return self.pixel_mean.device + + def _move_to_current_device(self, x): + return move_device_like(x, self.pixel_mean) + + def forward(self, batched_inputs): + """ + Args: + Same as in :class:`GeneralizedRCNN.forward` + + Returns: + list[dict]: + Each dict is the output for one input image. + The dict contains one key "proposals" whose value is a + :class:`Instances` with keys "proposal_boxes" and "objectness_logits". + """ + images = [self._move_to_current_device(x["image"]) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors( + images, + self.backbone.size_divisibility, + padding_constraints=self.backbone.padding_constraints, + ) + features = self.backbone(images.tensor) + + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + elif "targets" in batched_inputs[0]: + log_first_n( + logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 + ) + gt_instances = [x["targets"].to(self.device) for x in batched_inputs] + else: + gt_instances = None + proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) + # In training, the proposals are not useful at all but we generate them anyway. + # This makes RPN-only models about 5% slower. + if self.training: + return proposal_losses + + processed_results = [] + for results_per_image, input_per_image, image_size in zip( + proposals, batched_inputs, images.image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"proposals": r}) + return processed_results diff --git a/EVA/EVA-02/det/detectron2/modeling/meta_arch/retinanet.py b/EVA/EVA-02/det/detectron2/modeling/meta_arch/retinanet.py new file mode 100644 index 00000000..bd72a8e7 --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/meta_arch/retinanet.py @@ -0,0 +1,439 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+import logging +import math +from typing import List, Tuple +import torch +from fvcore.nn import sigmoid_focal_loss_jit +from torch import Tensor, nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import CycleBatchNormList, ShapeSpec, batched_nms, cat, get_norm +from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou +from detectron2.utils.events import get_event_storage + +from ..anchor_generator import build_anchor_generator +from ..backbone import Backbone, build_backbone +from ..box_regression import Box2BoxTransform, _dense_box_regression_loss +from ..matcher import Matcher +from .build import META_ARCH_REGISTRY +from .dense_detector import DenseDetector, permute_to_N_HWA_K # noqa + +__all__ = ["RetinaNet"] + + +logger = logging.getLogger(__name__) + + +@META_ARCH_REGISTRY.register() +class RetinaNet(DenseDetector): + """ + Implement RetinaNet in :paper:`RetinaNet`. + """ + + @configurable + def __init__( + self, + *, + backbone: Backbone, + head: nn.Module, + head_in_features, + anchor_generator, + box2box_transform, + anchor_matcher, + num_classes, + focal_loss_alpha=0.25, + focal_loss_gamma=2.0, + smooth_l1_beta=0.0, + box_reg_loss_type="smooth_l1", + test_score_thresh=0.05, + test_topk_candidates=1000, + test_nms_thresh=0.5, + max_detections_per_image=100, + pixel_mean, + pixel_std, + vis_period=0, + input_format="BGR", + ): + """ + NOTE: this interface is experimental. + + Args: + backbone: a backbone module, must follow detectron2's backbone interface + head (nn.Module): a module that predicts logits and regression deltas + for each level from a list of per-level features + head_in_features (Tuple[str]): Names of the input feature maps to be used in head + anchor_generator (nn.Module): a module that creates anchors from a + list of features. Usually an instance of :class:`AnchorGenerator` + box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to + instance boxes + anchor_matcher (Matcher): label the anchors by matching them with ground truth. + num_classes (int): number of classes. Used to label background proposals. + + # Loss parameters: + focal_loss_alpha (float): focal_loss_alpha + focal_loss_gamma (float): focal_loss_gamma + smooth_l1_beta (float): smooth_l1_beta + box_reg_loss_type (str): Options are "smooth_l1", "giou", "diou", "ciou" + + # Inference parameters: + test_score_thresh (float): Inference cls score threshold, only anchors with + score > INFERENCE_TH are considered for inference (to improve speed) + test_topk_candidates (int): Select topk candidates before NMS + test_nms_thresh (float): Overlap threshold used for non-maximum suppression + (suppress boxes with IoU >= this threshold) + max_detections_per_image (int): + Maximum number of detections to return per image during inference + (100 is based on the limit established for the COCO dataset). + + pixel_mean, pixel_std: see :class:`DenseDetector`. 
+ """ + super().__init__( + backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std + ) + self.num_classes = num_classes + + # Anchors + self.anchor_generator = anchor_generator + self.box2box_transform = box2box_transform + self.anchor_matcher = anchor_matcher + + # Loss parameters: + self.focal_loss_alpha = focal_loss_alpha + self.focal_loss_gamma = focal_loss_gamma + self.smooth_l1_beta = smooth_l1_beta + self.box_reg_loss_type = box_reg_loss_type + # Inference parameters: + self.test_score_thresh = test_score_thresh + self.test_topk_candidates = test_topk_candidates + self.test_nms_thresh = test_nms_thresh + self.max_detections_per_image = max_detections_per_image + # Vis parameters + self.vis_period = vis_period + self.input_format = input_format + + @classmethod + def from_config(cls, cfg): + backbone = build_backbone(cfg) + backbone_shape = backbone.output_shape() + feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES] + head = RetinaNetHead(cfg, feature_shapes) + anchor_generator = build_anchor_generator(cfg, feature_shapes) + return { + "backbone": backbone, + "head": head, + "anchor_generator": anchor_generator, + "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS), + "anchor_matcher": Matcher( + cfg.MODEL.RETINANET.IOU_THRESHOLDS, + cfg.MODEL.RETINANET.IOU_LABELS, + allow_low_quality_matches=True, + ), + "pixel_mean": cfg.MODEL.PIXEL_MEAN, + "pixel_std": cfg.MODEL.PIXEL_STD, + "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES, + "head_in_features": cfg.MODEL.RETINANET.IN_FEATURES, + # Loss parameters: + "focal_loss_alpha": cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA, + "focal_loss_gamma": cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA, + "smooth_l1_beta": cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA, + "box_reg_loss_type": cfg.MODEL.RETINANET.BBOX_REG_LOSS_TYPE, + # Inference parameters: + "test_score_thresh": cfg.MODEL.RETINANET.SCORE_THRESH_TEST, + "test_topk_candidates": cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST, + "test_nms_thresh": cfg.MODEL.RETINANET.NMS_THRESH_TEST, + "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE, + # Vis parameters + "vis_period": cfg.VIS_PERIOD, + "input_format": cfg.INPUT.FORMAT, + } + + def forward_training(self, images, features, predictions, gt_instances): + # Transpose the Hi*Wi*A dimension to the middle: + pred_logits, pred_anchor_deltas = self._transpose_dense_predictions( + predictions, [self.num_classes, 4] + ) + anchors = self.anchor_generator(features) + gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances) + return self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes) + + def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes): + """ + Args: + anchors (list[Boxes]): a list of #feature level Boxes + gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`. + Their shapes are (N, R) and (N, R, 4), respectively, where R is + the total number of anchors across levels, i.e. sum(Hi x Wi x Ai) + pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the + list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4). + Where K is the number of classes used in `pred_logits`. + + Returns: + dict[str, Tensor]: + mapping from a named loss to a scalar tensor storing the loss. + Used during training only. 
The dict keys are: "loss_cls" and "loss_box_reg" + """ + num_images = len(gt_labels) + gt_labels = torch.stack(gt_labels) # (N, R) + + valid_mask = gt_labels >= 0 + pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes) + num_pos_anchors = pos_mask.sum().item() + get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images) + normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 100) + + # classification and regression loss + gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[ + :, :-1 + ] # no loss for the last (background) class + loss_cls = sigmoid_focal_loss_jit( + cat(pred_logits, dim=1)[valid_mask], + gt_labels_target.to(pred_logits[0].dtype), + alpha=self.focal_loss_alpha, + gamma=self.focal_loss_gamma, + reduction="sum", + ) + + loss_box_reg = _dense_box_regression_loss( + anchors, + self.box2box_transform, + pred_anchor_deltas, + gt_boxes, + pos_mask, + box_reg_loss_type=self.box_reg_loss_type, + smooth_l1_beta=self.smooth_l1_beta, + ) + + return { + "loss_cls": loss_cls / normalizer, + "loss_box_reg": loss_box_reg / normalizer, + } + + @torch.no_grad() + def label_anchors(self, anchors, gt_instances): + """ + Args: + anchors (list[Boxes]): A list of #feature level Boxes. + The Boxes contains anchors of this image on the specific feature level. + gt_instances (list[Instances]): a list of N `Instances`s. The i-th + `Instances` contains the ground-truth per-instance annotations + for the i-th input image. + + Returns: + list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is + the total number of anchors across all feature maps (sum(Hi * Wi * A)). + Label values are in {-1, 0, ..., K}, with -1 means ignore, and K means background. + + list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of anchors + across feature maps. The values are the matched gt boxes for each anchor. + Values are undefined for those anchors not labeled as foreground. + """ + anchors = Boxes.cat(anchors) # Rx4 + + gt_labels = [] + matched_gt_boxes = [] + for gt_per_image in gt_instances: + match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors) + matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix) + del match_quality_matrix + + if len(gt_per_image) > 0: + matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs] + + gt_labels_i = gt_per_image.gt_classes[matched_idxs] + # Anchors with label 0 are treated as background. + gt_labels_i[anchor_labels == 0] = self.num_classes + # Anchors with label -1 are ignored. 
+ gt_labels_i[anchor_labels == -1] = -1 + else: + matched_gt_boxes_i = torch.zeros_like(anchors.tensor) + gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes + + gt_labels.append(gt_labels_i) + matched_gt_boxes.append(matched_gt_boxes_i) + + return gt_labels, matched_gt_boxes + + def forward_inference( + self, images: ImageList, features: List[Tensor], predictions: List[List[Tensor]] + ): + pred_logits, pred_anchor_deltas = self._transpose_dense_predictions( + predictions, [self.num_classes, 4] + ) + anchors = self.anchor_generator(features) + + results: List[Instances] = [] + for img_idx, image_size in enumerate(images.image_sizes): + scores_per_image = [x[img_idx].sigmoid_() for x in pred_logits] + deltas_per_image = [x[img_idx] for x in pred_anchor_deltas] + results_per_image = self.inference_single_image( + anchors, scores_per_image, deltas_per_image, image_size + ) + results.append(results_per_image) + return results + + def inference_single_image( + self, + anchors: List[Boxes], + box_cls: List[Tensor], + box_delta: List[Tensor], + image_size: Tuple[int, int], + ): + """ + Single-image inference. Return bounding-box detection results by thresholding + on scores and applying non-maximum suppression (NMS). + + Arguments: + anchors (list[Boxes]): list of #feature levels. Each entry contains + a Boxes object, which contains all the anchors in that feature level. + box_cls (list[Tensor]): list of #feature levels. Each entry contains + tensor of size (H x W x A, K) + box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. + image_size (tuple(H, W)): a tuple of the image height and width. + + Returns: + Same as `inference`, but for only one image. + """ + pred = self._decode_multi_level_predictions( + anchors, + box_cls, + box_delta, + self.test_score_thresh, + self.test_topk_candidates, + image_size, + ) + keep = batched_nms( # per-class NMS + pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh + ) + return pred[keep[: self.max_detections_per_image]] + + +class RetinaNetHead(nn.Module): + """ + The head used in RetinaNet for object classification and box regression. + It has two subnets for the two tasks, with a common structure but separate parameters. + """ + + @configurable + def __init__( + self, + *, + input_shape: List[ShapeSpec], + num_classes, + num_anchors, + conv_dims: List[int], + norm="", + prior_prob=0.01, + ): + """ + NOTE: this interface is experimental. + + Args: + input_shape (List[ShapeSpec]): input shape + num_classes (int): number of classes. Used to label background proposals. + num_anchors (int): number of generated anchors + conv_dims (List[int]): dimensions for each convolution layer + norm (str or callable): + Normalization for conv layers except for the two output layers. + See :func:`detectron2.layers.get_norm` for supported types. + prior_prob (float): Prior weight for computing bias + """ + super().__init__() + + self._num_features = len(input_shape) + if norm == "BN" or norm == "SyncBN": + logger.info( + f"Using domain-specific {norm} in RetinaNetHead with len={self._num_features}." + ) + bn_class = nn.BatchNorm2d if norm == "BN" else nn.SyncBatchNorm + + def norm(c): + return CycleBatchNormList( + length=self._num_features, bn_class=bn_class, num_features=c + ) + + else: + norm_name = str(type(get_norm(norm, 32))) + if "BN" in norm_name: + logger.warning( + f"Shared BatchNorm (type={norm_name}) may not work well in RetinaNetHead." 
+ ) + + cls_subnet = [] + bbox_subnet = [] + for in_channels, out_channels in zip( + [input_shape[0].channels] + list(conv_dims), conv_dims + ): + cls_subnet.append( + nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + ) + if norm: + cls_subnet.append(get_norm(norm, out_channels)) + cls_subnet.append(nn.ReLU()) + bbox_subnet.append( + nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + ) + if norm: + bbox_subnet.append(get_norm(norm, out_channels)) + bbox_subnet.append(nn.ReLU()) + + self.cls_subnet = nn.Sequential(*cls_subnet) + self.bbox_subnet = nn.Sequential(*bbox_subnet) + self.cls_score = nn.Conv2d( + conv_dims[-1], num_anchors * num_classes, kernel_size=3, stride=1, padding=1 + ) + self.bbox_pred = nn.Conv2d( + conv_dims[-1], num_anchors * 4, kernel_size=3, stride=1, padding=1 + ) + + # Initialization + for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]: + for layer in modules.modules(): + if isinstance(layer, nn.Conv2d): + torch.nn.init.normal_(layer.weight, mean=0, std=0.01) + torch.nn.init.constant_(layer.bias, 0) + + # Use prior in model initialization to improve stability + bias_value = -(math.log((1 - prior_prob) / prior_prob)) + torch.nn.init.constant_(self.cls_score.bias, bias_value) + + @classmethod + def from_config(cls, cfg, input_shape: List[ShapeSpec]): + num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors + assert ( + len(set(num_anchors)) == 1 + ), "Using different number of anchors between levels is not currently supported!" + num_anchors = num_anchors[0] + + return { + "input_shape": input_shape, + "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES, + "conv_dims": [input_shape[0].channels] * cfg.MODEL.RETINANET.NUM_CONVS, + "prior_prob": cfg.MODEL.RETINANET.PRIOR_PROB, + "norm": cfg.MODEL.RETINANET.NORM, + "num_anchors": num_anchors, + } + + def forward(self, features: List[Tensor]): + """ + Arguments: + features (list[Tensor]): FPN feature map tensors in high to low resolution. + Each tensor in the list correspond to different feature levels. + + Returns: + logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi). + The tensor predicts the classification probability + at each spatial position for each of the A anchors and K object + classes. + bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi). + The tensor predicts 4-vector (dx,dy,dw,dh) box + regression values for every anchor. These values are the + relative offset between the anchor and the ground truth box. + """ + assert len(features) == self._num_features + logits = [] + bbox_reg = [] + for feature in features: + logits.append(self.cls_score(self.cls_subnet(feature))) + bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature))) + return logits, bbox_reg diff --git a/EVA/EVA-02/det/detectron2/modeling/meta_arch/semantic_seg.py b/EVA/EVA-02/det/detectron2/modeling/meta_arch/semantic_seg.py new file mode 100644 index 00000000..fefbecfb --- /dev/null +++ b/EVA/EVA-02/det/detectron2/modeling/meta_arch/semantic_seg.py @@ -0,0 +1,267 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
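+# Usage sketch for the registry defined in this file (illustrative only;
+# `MyCustomHead` is a hypothetical name): a custom head is registered with
+#
+#     @SEM_SEG_HEADS_REGISTRY.register()
+#     class MyCustomHead(nn.Module):
+#         def __init__(self, cfg, input_shape):  # receives (cfg, input_shape)
+#             ...
+#
+# and selected via cfg.MODEL.SEM_SEG_HEAD.NAME = "MyCustomHead", after which
+# build_sem_seg_head(cfg, input_shape) below instantiates it.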
+import numpy as np +from typing import Callable, Dict, Optional, Tuple, Union +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.structures import ImageList +from detectron2.utils.registry import Registry + +from ..backbone import Backbone, build_backbone +from ..postprocessing import sem_seg_postprocess +from .build import META_ARCH_REGISTRY + +__all__ = [ + "SemanticSegmentor", + "SEM_SEG_HEADS_REGISTRY", + "SemSegFPNHead", + "build_sem_seg_head", +] + + +SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS") +SEM_SEG_HEADS_REGISTRY.__doc__ = """ +Registry for semantic segmentation heads, which make semantic segmentation predictions +from feature maps. +""" + + +@META_ARCH_REGISTRY.register() +class SemanticSegmentor(nn.Module): + """ + Main class for semantic segmentation architectures. + """ + + @configurable + def __init__( + self, + *, + backbone: Backbone, + sem_seg_head: nn.Module, + pixel_mean: Tuple[float], + pixel_std: Tuple[float], + ): + """ + Args: + backbone: a backbone module, must follow detectron2's backbone interface + sem_seg_head: a module that predicts semantic segmentation from backbone features + pixel_mean, pixel_std: list or tuple with #channels element, representing + the per-channel mean and std to be used to normalize the input image + """ + super().__init__() + self.backbone = backbone + self.sem_seg_head = sem_seg_head + self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) + + @classmethod + def from_config(cls, cfg): + backbone = build_backbone(cfg) + sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape()) + return { + "backbone": backbone, + "sem_seg_head": sem_seg_head, + "pixel_mean": cfg.MODEL.PIXEL_MEAN, + "pixel_std": cfg.MODEL.PIXEL_STD, + } + + @property + def device(self): + return self.pixel_mean.device + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper`. + Each item in the list contains the inputs for one image. + + For now, each item in the list is a dict that contains: + + * "image": Tensor, image in (C, H, W) format. + * "sem_seg": semantic segmentation ground truth + * Other information that's included in the original dicts, such as: + "height", "width" (int): the output resolution of the model (may be different + from input resolution), used in inference. + + + Returns: + list[dict]: + Each dict is the output for one input image. + The dict contains one key "sem_seg" whose value is a + Tensor that represents the + per-pixel segmentation prediced by the head. + The prediction has shape KxHxW that represents the logits of + each class for each pixel. 
+ """ + images = [x["image"].to(self.device) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors( + images, + self.backbone.size_divisibility, + padding_constraints=self.backbone.padding_constraints, + ) + + features = self.backbone(images.tensor) + + if "sem_seg" in batched_inputs[0]: + targets = [x["sem_seg"].to(self.device) for x in batched_inputs] + targets = ImageList.from_tensors( + targets, + self.backbone.size_divisibility, + self.sem_seg_head.ignore_value, + self.backbone.padding_constraints, + ).tensor + else: + targets = None + results, losses = self.sem_seg_head(features, targets) + + if self.training: + return losses + + processed_results = [] + for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = sem_seg_postprocess(result, image_size, height, width) + processed_results.append({"sem_seg": r}) + return processed_results + + +def build_sem_seg_head(cfg, input_shape): + """ + Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`. + """ + name = cfg.MODEL.SEM_SEG_HEAD.NAME + return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) + + +@SEM_SEG_HEADS_REGISTRY.register() +class SemSegFPNHead(nn.Module): + """ + A semantic segmentation head described in :paper:`PanopticFPN`. + It takes a list of FPN features as input, and applies a sequence of + 3x3 convs and upsampling to scale all of them to the stride defined by + ``common_stride``. Then these features are added and used to make final + predictions by another 1x1 conv layer. + """ + + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + num_classes: int, + conv_dims: int, + common_stride: int, + loss_weight: float = 1.0, + norm: Optional[Union[str, Callable]] = None, + ignore_value: int = -1, + ): + """ + NOTE: this interface is experimental. + + Args: + input_shape: shapes (channels and stride) of the input features + num_classes: number of classes to predict + conv_dims: number of output channels for the intermediate conv layers. + common_stride: the common stride that all features will be upscaled to + loss_weight: loss weight + norm (str or callable): normalization for all conv layers + ignore_value: category id to be ignored during training. 
+ """ + super().__init__() + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + if not len(input_shape): + raise ValueError("SemSegFPNHead(input_shape=) cannot be empty!") + self.in_features = [k for k, v in input_shape] + feature_strides = [v.stride for k, v in input_shape] + feature_channels = [v.channels for k, v in input_shape] + + self.ignore_value = ignore_value + self.common_stride = common_stride + self.loss_weight = loss_weight + + self.scale_heads = [] + for in_feature, stride, channels in zip( + self.in_features, feature_strides, feature_channels + ): + head_ops = [] + head_length = max(1, int(np.log2(stride) - np.log2(self.common_stride))) + for k in range(head_length): + norm_module = get_norm(norm, conv_dims) + conv = Conv2d( + channels if k == 0 else conv_dims, + conv_dims, + kernel_size=3, + stride=1, + padding=1, + bias=not norm, + norm=norm_module, + activation=F.relu, + ) + weight_init.c2_msra_fill(conv) + head_ops.append(conv) + if stride != self.common_stride: + head_ops.append( + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) + ) + self.scale_heads.append(nn.Sequential(*head_ops)) + self.add_module(in_feature, self.scale_heads[-1]) + self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0) + weight_init.c2_msra_fill(self.predictor) + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + return { + "input_shape": { + k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES + }, + "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + "conv_dims": cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM, + "common_stride": cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE, + "norm": cfg.MODEL.SEM_SEG_HEAD.NORM, + "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, + } + + def forward(self, features, targets=None): + """ + Returns: + In training, returns (None, dict of losses) + In inference, returns (CxHxW logits, {}) + """ + x = self.layers(features) + if self.training: + return None, self.losses(x, targets) + else: + x = F.interpolate( + x, scale_factor=self.common_stride, mode="bilinear", align_corners=False + ) + return x, {} + + def layers(self, features): + for i, f in enumerate(self.in_features): + if i == 0: + x = self.scale_heads[i](features[f]) + else: + x = x + self.scale_heads[i](features[f]) + x = self.predictor(x) + return x + + def losses(self, predictions, targets): + predictions = predictions.float() # https://github.com/pytorch/pytorch/issues/48163 + predictions = F.interpolate( + predictions, + scale_factor=self.common_stride, + mode="bilinear", + align_corners=False, + ) + loss = F.cross_entropy( + predictions, targets, reduction="mean", ignore_index=self.ignore_value + ) + losses = {"loss_sem_seg": loss * self.loss_weight} + return losses diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py new file mode 100644 index 00000000..9dba2030 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py @@ -0,0 +1,95 @@ +from functools import partial +import torch.nn as nn +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling import MViT +from detectron2.layers import ShapeSpec +from 
detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import ( + FastRCNNOutputLayers, + FastRCNNConvFCHead, + CascadeROIHeads, +) + +from ..common.coco_loader_lsj import dataloader + +model = model_zoo.get_config("common/models/mask_rcnn_fpn.py").model +constants = model_zoo.get_config("common/data/constants.py").constants +model.pixel_mean = constants.imagenet_rgb256_mean +model.pixel_std = constants.imagenet_rgb256_std +model.input_format = "RGB" +model.backbone.bottom_up = L(MViT)( + embed_dim=96, + depth=24, + num_heads=1, + last_block_indexes=(1, 4, 20, 23), + residual_pooling=True, + drop_path_rate=0.4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + out_features=("scale2", "scale3", "scale4", "scale5"), +) +model.backbone.in_features = "${.bottom_up.out_features}" +model.backbone.square_pad = 1024 + +# New heads and LN +model.backbone.norm = "LN" # Use LN in FPN +model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN" + +# 2conv in RPN: +model.proposal_generator.head.conv_dims = [-1, -1] + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] +model.roi_heads.update( + _target_=CascadeROIHeads, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + cls_agnostic_bbox_reg=True, + num_classes="${...num_classes}", + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in21k.pyth" + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} +optimizer.lr = 8e-5 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_h_in21k_36ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_h_in21k_36ep.py new file mode 100644 index 00000000..57704504 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_h_in21k_36ep.py @@ -0,0 +1,39 @@ +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler + +from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, +) + +model.backbone.bottom_up.embed_dim = 192 +model.backbone.bottom_up.depth = 80 +model.backbone.bottom_up.num_heads = 3 +model.backbone.bottom_up.last_block_indexes = (3, 11, 71, 79) +model.backbone.bottom_up.drop_path_rate = 0.6 
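+# Rough bookkeeping for the shortened schedule used further down in this config
+# (an illustrative note, assuming the same ~118k-image COCO train2017 set and
+# 64 images/iter as the base config's "100 ep = 184375 iters" comment):
+#   67500 iters * 64 images/iter / 118000 images/ep ~= 36.6 ep,
+# which is why train.max_iter is set to 67500 for the "36 epochs" run.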
+model.backbone.bottom_up.use_act_checkpoint = True + + +train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_H_in21k.pyth" + + +# 36 epochs +train.max_iter = 67500 +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[ + 52500, + 62500, + 67500, + ], + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) +optimizer.lr = 1.6e-4 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py new file mode 100644 index 00000000..c64f0c18 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py @@ -0,0 +1,22 @@ +from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, +) + +model.backbone.bottom_up.embed_dim = 144 +model.backbone.bottom_up.depth = 48 +model.backbone.bottom_up.num_heads = 2 +model.backbone.bottom_up.last_block_indexes = (1, 7, 43, 47) +model.backbone.bottom_up.drop_path_rate = 0.5 + + +train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_L_in21k.pyth" + +train.max_iter = train.max_iter // 2 # 100ep -> 50ep +lr_multiplier.scheduler.milestones = [ + milestone // 2 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_swin_b_in21k_50ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_swin_b_in21k_50ep.py new file mode 100644 index 00000000..b2aad985 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_swin_b_in21k_50ep.py @@ -0,0 +1,50 @@ +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling import SwinTransformer + +from ..common.coco_loader_lsj import dataloader +from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import model + +model.backbone.bottom_up = L(SwinTransformer)( + depths=[2, 2, 18, 2], + drop_path_rate=0.4, + embed_dim=128, + num_heads=[4, 8, 16, 32], +) +model.backbone.in_features = ("p0", "p1", "p2", "p3") +model.backbone.square_pad = 1024 + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/swin/swin_base_patch4_window7_224_22k.pth" + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +# Rescale schedule +train.max_iter = train.max_iter // 2 # 100ep -> 50ep +lr_multiplier.scheduler.milestones = [ + milestone // 2 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter + + +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.lr = 4e-5 +optimizer.weight_decay = 0.05 +optimizer.params.overrides = {"relative_position_bias_table": {"weight_decay": 0.0}} diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_swin_l_in21k_50ep.py 
b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_swin_l_in21k_50ep.py new file mode 100644 index 00000000..60bc917b --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_swin_l_in21k_50ep.py @@ -0,0 +1,15 @@ +from .cascade_mask_rcnn_swin_b_in21k_50ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, +) + +model.backbone.bottom_up.depths = [2, 2, 18, 2] +model.backbone.bottom_up.drop_path_rate = 0.4 +model.backbone.bottom_up.embed_dim = 192 +model.backbone.bottom_up.num_heads = [6, 12, 24, 48] + + +train.init_checkpoint = "detectron2://ImageNetPretrained/swin/swin_large_patch4_window7_224_22k.pth" diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_1536.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_1536.py new file mode 100644 index 00000000..311106fe --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_1536.py @@ -0,0 +1,46 @@ +from functools import partial + +from ..common.coco_loader_lsj_1536 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1536 # 1024 +model.backbone.square_pad = 1536 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + # list(range(0, 9)) + list(range(10, 19)) + list(range(20, 29)) + list(range(30, 39)) + # list(range(0, 13)) + list(range(14, 26)) + list(range(27, 39)) + # list(range(0, 19)) + list(range(20, 39)) + list(range(0, 40)) +) +model.backbone.net.residual_block_indexes = ( + list(range(3, 41, 4)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_75ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_75ep.py new file mode 100644 index 00000000..f5064519 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_75ep.py @@ -0,0 +1,43 @@ +from functools import partial + +from ..common.coco_loader_lsj_1280 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1280 # 1024 
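+# Note on the attention layout set further down (as documented for the ViT
+# backbone in detectron2/modeling/backbone/vit.py, also added by this patch):
+# indexes listed in `window_block_indexes` use windowed self-attention, blocks
+# left out of that list run global attention, and `residual_block_indexes`
+# interleaves conv residual blocks for cross-window propagation. With
+# `list(range(0, 40))` every block here is windowed, so propagation relies on
+# the conv blocks at `range(3, 41, 4)`, i.e. every 4th block starting at 3.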
+model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(0, 40)) +) +model.backbone.net.residual_block_indexes = ( + list(range(3, 41, 4)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn.py new file mode 100644 index 00000000..2b0ddcad --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_1280 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1280 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + 20000, train.max_iter-1 +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 500 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1280.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1280.py new file mode 100644 index 00000000..0e57b36d --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1280.py 
@@ -0,0 +1,46 @@ +from functools import partial + +from ..common.coco_loader_lsj_1280 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1280 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 500 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1408.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1408.py new file mode 100644 index 00000000..bb8d8e8c --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1408.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_1408 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1408 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func 
= partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + train.max_iter-2, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1536.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1536.py new file mode 100644 index 00000000..35ecf527 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1536.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_1536 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1536 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + 14999, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1664.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1664.py new file mode 100644 index 00000000..7eaa0d8c --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1664.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_1664 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1664 # 1024 +model.backbone.net.patch_size = 16 # 14 
--> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + train.max_iter-2, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1792.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1792.py new file mode 100644 index 00000000..97b33a73 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1792.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_1792 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1792 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + train.max_iter-2, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 
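# NOTE (illustrative sketch, not part of the patch): the cascade_mask_rcnn_vitdet_1B_attn_*
# configs above share the same 1B backbone settings and differ mainly in the LSJ loader /
# `square_pad` resolution (plus ad-hoc milestone and warmup tweaks). Their long
# `window_block_indexes` expression keeps every 4th block (indices 3, 7, ..., 39) on
# global attention and windows the rest; the "# 7, 15, 23, 31 for global attention"
# comment appears to be carried over from the 32-block ViT-H config. A small sketch
# reproducing the same index list; `windowed_block_indexes` is a hypothetical helper.

def windowed_block_indexes(depth=40, global_every=4):
    # blocks whose (index + 1) is a multiple of `global_every` stay global
    return [i for i in range(depth) if (i + 1) % global_every != 0]

expected = (
    list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15))
    + list(range(16, 19)) + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31))
    + list(range(32, 35)) + list(range(36, 39))
)
assert windowed_block_indexes() == expected  # global attention at blocks 3, 7, ..., 39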
diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1920.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1920.py new file mode 100644 index 00000000..d2e0c03f --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_1920.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_1920 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1920 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + train.max_iter-2, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2048.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2048.py new file mode 100644 index 00000000..e9355380 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2048.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_2048 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 2048 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + 
list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + train.max_iter-2, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2176.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2176.py new file mode 100644 index 00000000..633b8867 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2176.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_2176 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 2176 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + train.max_iter-2, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2304.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2304.py new file mode 100644 index 00000000..5e5375ef --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2304.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_2304 import dataloader +from 
.cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 2304 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + train.max_iter-2, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2432.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2432.py new file mode 100644 index 00000000..7536acc8 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2432.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_2432 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 2432 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 
--> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + train.max_iter-2, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2560.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2560.py new file mode 100644 index 00000000..1c798eb3 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_1B_attn_2560.py @@ -0,0 +1,47 @@ +from functools import partial + +from ..common.coco_loader_lsj_2560 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/xinlongwang/models/mae/" \ + "mae_vit_giant_patch14_150ep_8x8gpu_in21k_70ep_bf16/checkpoint-149-s14tos16.pth" + +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 2560 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + train.max_iter-2, train.max_iter-1, +# milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 0 / train.max_iter # 2ep 118k*2/64 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..95823ef4 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,48 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import ( + FastRCNNOutputLayers, + FastRCNNConvFCHead, + CascadeROIHeads, +) + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + 
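# NOTE (illustrative sketch, not part of the patch): the pop() just above removes the
# single-stage box head, predictor, and matcher, and the update() that follows replaces
# them with three cascade stages. Each stage pairs a stricter proposal-matching IoU
# threshold with tighter Box2BoxTransform weights; the per-stage values used below are:
cascade_stages = [
    # (proposal matcher IoU threshold, Box2BoxTransform weights (wx, wy, ww, wh))
    (0.5, (10, 10, 5, 5)),
    (0.6, (20, 20, 10, 10)),
    (0.7, (30, 30, 15, 15)),
]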
+model.roi_heads.update( + _target_=CascadeROIHeads, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + cls_agnostic_bbox_reg=True, + num_classes="${...num_classes}", + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_b_ours.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_b_ours.py new file mode 100644 index 00000000..ac933b68 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_b_ours.py @@ -0,0 +1,51 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import ( + FastRCNNOutputLayers, + FastRCNNConvFCHead, + CascadeROIHeads, +) + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + cls_agnostic_bbox_reg=True, + num_classes="${...num_classes}", + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) + +# custom cfgs +model.backbone.net.beit_like_model = True diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep.py new file mode 100644 index 00000000..98ef51b0 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 1024 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = 
partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1024.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1024.py new file mode 100644 index 00000000..98ef51b0 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1024.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 1024 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1280.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1280.py new file mode 100644 index 00000000..10643dec --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1280.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_1280 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 1280 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff 
--git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1408.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1408.py new file mode 100644 index 00000000..0a5eb9a5 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1408.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_1408 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 1408 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1536.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1536.py new file mode 100644 index 00000000..4af3ad94 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1536.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_1536 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 1536 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1664.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1664.py new file mode 100644 index 00000000..02be29cd --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1664.py @@ -0,0 +1,36 @@ +from functools import 
partial + +from ..common.coco_loader_lsj_1664 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 1664 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1792.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1792.py new file mode 100644 index 00000000..5f73d8f2 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1792.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_1792 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 1792 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1920.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1920.py new file mode 100644 index 00000000..65d52b75 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_1920.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_1920 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# 
"detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 1920 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2048.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2048.py new file mode 100644 index 00000000..9b7821af --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2048.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_2048 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 2048 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2176.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2176.py new file mode 100644 index 00000000..e075eea0 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2176.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_2176 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 2176 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention 
+model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2304.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2304.py new file mode 100644 index 00000000..a65272d9 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2304.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_2304 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 2304 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2432.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2432.py new file mode 100644 index 00000000..ea731bcb --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2432.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_2432 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 2432 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 
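# NOTE (illustrative sketch, not part of the patch): the cascade_mask_rcnn_vitdet_h_75ep_<size>
# variants above change only the input resolution (the coco_loader_lsj_<size> loader and
# `square_pad`); the ViT-H backbone settings are identical. Assuming inputs are padded to a
# square of `square_pad` pixels with 16x16 patches, the sequence length seen by the
# global-attention blocks grows quadratically with that size, which is the memory pressure
# that `use_act_checkpoint = True` mitigates. A rough sketch of the token counts:

def num_tokens(square_pad, patch_size=16):
    return (square_pad // patch_size) ** 2

for size in (1024, 1536, 2048, 2560):
    print(size, num_tokens(size))  # 4096, 9216, 16384, 25600 tokens per global block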
+lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2560.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2560.py new file mode 100644 index 00000000..6b0ad12b --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_2560.py @@ -0,0 +1,36 @@ +from functools import partial + +from ..common.coco_loader_lsj_2560 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( +# dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 2560 +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_conv.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_conv.py new file mode 100644 index 00000000..bbb0cfb6 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_h_75ep_conv.py @@ -0,0 +1,39 @@ +from functools import partial + +from .cascade_mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/wwen-a/model_weights/mae/mae_pretrain_vit_huge_p14to16.pth" +# "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +model.backbone.net.use_act_checkpoint = True +# 7, 15, 23, 31 for global attention +# model.backbone.net.window_block_indexes = ( +# list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +# ) +model.backbone.net.window_block_indexes = ( + list(range(0, 32)) +) +model.backbone.net.residual_block_indexes = ( + list(range(3, 33, 4)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep +lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_l_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_l_100ep.py new 
file mode 100644 index 00000000..3ec259e0 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/cascade_mask_rcnn_vitdet_l_100ep.py @@ -0,0 +1,23 @@ +from functools import partial + +from .cascade_mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth" + +model.backbone.net.embed_dim = 1024 +model.backbone.net.depth = 24 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.4 +# 5, 11, 17, 23 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..7206525f --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,38 @@ +from functools import partial +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from ..common.coco_loader_lsj import dataloader + + +model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" + + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +# Optimizer +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_h_75ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_h_75ep.py new file mode 100644 index 00000000..9fe752c6 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_h_75ep.py @@ -0,0 +1,31 @@ +from functools import partial + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.5 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 
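# NOTE (illustrative sketch, not part of the patch): the
# `lr_factor_func = partial(get_vit_lr_decay_rate, ...)` lines above apply layer-wise LR
# decay, so earlier transformer blocks train with geometrically smaller learning rates
# than the detection heads. Roughly, block i of num_layers is scaled by
# lr_decay_rate ** (num_layers - i); see detectron2's get_vit_lr_decay_rate for the exact
# parameter-name parsing. A minimal sketch of the resulting multipliers (helper name is
# hypothetical):

def approx_layer_lr_factor(block_index, num_layers, lr_decay_rate):
    # simplified per-block factor; embeddings and heads are handled separately in detectron2
    return lr_decay_rate ** (num_layers - block_index)

# ViT-H (32 blocks, decay 0.9): first block ~0.9**32 ~= 0.034, last block 0.9
for i in (0, 15, 31):
    print(i, round(approx_layer_lr_factor(i, 32, 0.9), 4))
# ViT-L (24 blocks, decay 0.8): first block ~0.8**24 ~= 0.0047
print(round(approx_layer_lr_factor(0, 24, 0.8), 4))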
+lr_multiplier.scheduler.milestones = [ + milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_l_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_l_100ep.py new file mode 100644 index 00000000..933b84eb --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_l_100ep.py @@ -0,0 +1,23 @@ +from functools import partial + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth" + +model.backbone.net.embed_dim = 1024 +model.backbone.net.depth = 24 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.4 +# 5, 11, 17, 23 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn.py new file mode 100644 index 00000000..cab8b619 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn.py @@ -0,0 +1,70 @@ +from functools import partial + +from fvcore.common.param_scheduler import MultiStepParamScheduler +from detectron2.solver import WarmupParamScheduler +from detectron2.config import LazyCall as L + +from ..common.objects365_trainval_loader_lsj_1280 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + # lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/yxf/outputs/beitXclip/large-giant/150/merge30M_beit_g_patch14_224_sz224_mask105_lr1e-3_b20.98_eps1e-6_dpr0.1_ls0.0_bsz16x8x32_ep150_wmep2_cj0.0_ftpye2_ltype1_mixup0.0_abspos/checkpoint-149/mp_rank_00_model_states_renamed-s14tos16.pt" + +# for o365 +model.roi_heads.mask_in_features = None +model.roi_heads.mask_pooler = None +model.roi_heads.mask_head = None +model.roi_heads.num_classes = 365 + +# for model +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1280 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = 350057 # 25ep, 
(1742292+50000) * 25 / 128 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[350050, 350056], + num_updates=train.max_iter, + ), + warmup_length=3500 / train.max_iter, + warmup_factor=0.001, +) + +dataloader.train.total_batch_size = 128 +optimizer.lr = 1.5e-4 +model.backbone.net.beit_like_qkv_bias = True +model.backbone.net.beit_like_gamma = False +train.output_dir = "work_dirs/o365_cascade_mask_rcnn_vitdet_1B_bs64_1280_attn_16x8" +train.checkpointer.period = 1000 +train.eval_period = 5000 + diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn_1024.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn_1024.py new file mode 100644 index 00000000..144547d1 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn_1024.py @@ -0,0 +1,70 @@ +from functools import partial + +from fvcore.common.param_scheduler import MultiStepParamScheduler +from detectron2.solver import WarmupParamScheduler +from detectron2.config import LazyCall as L + +from ..common.objects365_trainval_loader_lsj_1024 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + # lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/yxf/outputs/beitXclip/large-giant/150/merge30M_beit_g_patch14_224_sz224_mask105_lr1e-3_b20.98_eps1e-6_dpr0.1_ls0.0_bsz16x8x32_ep150_wmep2_cj0.0_ftpye2_ltype1_mixup0.0_abspos/checkpoint-149/mp_rank_00_model_states_renamed-s14tos16.pt" + +# for o365 +model.roi_heads.mask_in_features = None +model.roi_heads.mask_pooler = None +model.roi_heads.mask_head = None +model.roi_heads.num_classes = 365 + +# for model +model.backbone.net.img_size = 1024 # 1024 +model.backbone.square_pad = 1024 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = 350057 # 25ep, (1742292+50000) * 25 / 128 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[350050, 350056], + num_updates=train.max_iter, + ), + warmup_length=15000 / train.max_iter, + warmup_factor=0.001, +) + +dataloader.train.total_batch_size = 128 +optimizer.lr = 1e-4 +model.backbone.net.beit_like_qkv_bias = True +model.backbone.net.beit_like_gamma = False +train.output_dir = "work_dirs/o365_cascade_mask_rcnn_vitdet_1B_bs128_1024_attn_16x8" +train.checkpointer.period = 1000 +train.eval_period = 5000 + diff 
--git a/EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn_1024to1280.py b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn_1024to1280.py new file mode 100644 index 00000000..ae608b64 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/COCO/o365_cascade_mask_rcnn_vitdet_1B_attn_1024to1280.py @@ -0,0 +1,70 @@ +from functools import partial + +from fvcore.common.param_scheduler import MultiStepParamScheduler +from detectron2.solver import WarmupParamScheduler +from detectron2.config import LazyCall as L + +from ..common.objects365_trainval_loader_lsj_1280 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + # lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "/sharefs/baaivision/yxf/outputs/beitXclip/large-giant/150/merge30M_beit_g_patch14_224_sz224_mask105_lr1e-3_b20.98_eps1e-6_dpr0.1_ls0.0_bsz16x8x32_ep150_wmep2_cj0.0_ftpye2_ltype1_mixup0.0_abspos/checkpoint-149/mp_rank_00_model_states_renamed-s14tos16.pt" + +# for o365 +model.roi_heads.mask_in_features = None +model.roi_heads.mask_pooler = None +model.roi_heads.mask_head = None +model.roi_heads.num_classes = 365 + +# for model +model.backbone.net.img_size = 1280 # 1024 +model.backbone.square_pad = 1280 # 1024 +model.backbone.net.patch_size = 16 # 14 --> 16 +model.backbone.net.window_size = 16 # 14 --> 16 +model.backbone.net.embed_dim = 1408 +model.backbone.net.depth = 40 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 6144 / 1408 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.6 # 0.5 --> 0.6 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + # list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) + list(range(32, 39)) + list(range(0, 3)) + list(range(4, 7)) + list(range(8, 11)) + list(range(12, 15)) + list(range(16, 19)) + + list(range(20, 23)) + list(range(24, 27)) + list(range(28, 31)) + list(range(32, 35)) + list(range(36, 39)) + # list(range(0, 40)) +) +# model.backbone.net.residual_block_indexes = ( +# list(range(3, 41, 4)) +# ) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=40) # 32 --> 40 +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = 350057 # 25ep, (1742292+50000) * 25 / 128 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[45000, 350056], + num_updates=train.max_iter, + ), + warmup_length=5000 / train.max_iter, + warmup_factor=0.001, +) + +dataloader.train.total_batch_size = 128 +optimizer.lr = 1e-4 +model.backbone.net.beit_like_qkv_bias = True +model.backbone.net.beit_like_gamma = False +train.output_dir = "work_dirs/o365_cascade_mask_rcnn_vitdet_1B_bs128_1024_attn_16x8" +train.checkpointer.period = 1000 +train.eval_period = 5000 + diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py new file mode 100644 index 00000000..1cf9c3ea --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py @@ -0,0 +1,48 @@ +from functools import partial +import torch.nn as nn + +from detectron2.config import LazyCall as L +from detectron2.data.detection_utils import get_fed_loss_cls_weights +from detectron2.data.samplers import 
RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator + +from ..COCO.cascade_mask_rcnn_mvitv2_b_in21k_100ep import ( + dataloader, + model, + train, + lr_multiplier, + optimizer, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + +model.roi_heads.num_classes = 1203 +for i in range(3): + model.roi_heads.box_predictors[i].test_score_thresh = 0.02 + model.roi_heads.box_predictors[i].test_topk_per_image = 300 + model.roi_heads.box_predictors[i].use_sigmoid_ce = True + model.roi_heads.box_predictors[i].use_fed_loss = True + model.roi_heads.box_predictors[i].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ) + +# Schedule +# 100 ep = 156250 iters * 64 images/iter / 100000 images/ep +train.max_iter = 156250 +train.eval_period = 30000 + +lr_multiplier.scheduler.milestones = [138889, 150463] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 250 / train.max_iter + +optimizer.lr = 1e-4 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_h_in21k_50ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_h_in21k_50ep.py new file mode 100644 index 00000000..084444bf --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_h_in21k_50ep.py @@ -0,0 +1,25 @@ +from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, +) + +model.backbone.bottom_up.embed_dim = 192 +model.backbone.bottom_up.depth = 80 +model.backbone.bottom_up.num_heads = 3 +model.backbone.bottom_up.last_block_indexes = (3, 11, 71, 79) +model.backbone.bottom_up.drop_path_rate = 0.6 +model.backbone.bottom_up.use_act_checkpoint = True + +train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_H_in21k.pyth" + +train.max_iter = train.max_iter // 2 # 100ep -> 50ep +lr_multiplier.scheduler.milestones = [ + milestone // 2 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 250 / train.max_iter + +optimizer.lr = 2e-5 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py new file mode 100644 index 00000000..779442c6 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py @@ -0,0 +1,24 @@ +from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, +) + +model.backbone.bottom_up.embed_dim = 144 +model.backbone.bottom_up.depth = 48 +model.backbone.bottom_up.num_heads = 2 +model.backbone.bottom_up.last_block_indexes = (1, 7, 43, 47) +model.backbone.bottom_up.drop_path_rate = 0.5 + +train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_L_in21k.pyth" + +train.max_iter = train.max_iter // 2 # 100ep -> 50ep +lr_multiplier.scheduler.milestones = [ + milestone // 2 for milestone in lr_multiplier.scheduler.milestones +] +lr_multiplier.scheduler.num_updates = 
train.max_iter +lr_multiplier.warmup_length = 250 / train.max_iter + +optimizer.lr = 4e-5 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_swin_b_in21k_50ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_swin_b_in21k_50ep.py new file mode 100644 index 00000000..d18c925f --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_swin_b_in21k_50ep.py @@ -0,0 +1,49 @@ +from detectron2.config.lazy import LazyCall as L +from detectron2.data.detection_utils import get_fed_loss_cls_weights +from detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator + +from ..COCO.cascade_mask_rcnn_swin_b_in21k_50ep import ( + dataloader, + model, + train, + lr_multiplier, + optimizer, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + +model.backbone.bottom_up.drop_path_rate = 0.3 + +model.roi_heads.num_classes = 1203 +for i in range(3): + model.roi_heads.box_predictors[i].test_score_thresh = 0.02 + model.roi_heads.box_predictors[i].test_topk_per_image = 300 + model.roi_heads.box_predictors[i].use_sigmoid_ce = True + model.roi_heads.box_predictors[i].use_fed_loss = True + model.roi_heads.box_predictors[i].get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ) + +# Schedule +# 100 ep = 156250 iters * 64 images/iter / 100000 images/ep +# 100 ep -> 50 ep as the model achieves better performance with 50 epochs +train.max_iter = 156250 // 2 +train.eval_period = 30000 + +lr_multiplier.scheduler.milestones = [milestone // 2 for milestone in [138889, 150463]] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 250 / train.max_iter + +# Optimized hyperparams +optimizer.lr = 1e-4 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_swin_l_in21k_50ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_swin_l_in21k_50ep.py new file mode 100644 index 00000000..9e22e3b2 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_swin_l_in21k_50ep.py @@ -0,0 +1,12 @@ +from .cascade_mask_rcnn_swin_b_in21k_50ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, +) + +model.backbone.bottom_up.embed_dim = 192 +model.backbone.bottom_up.num_heads = [6, 12, 24, 48] + +train.init_checkpoint = "detectron2://ImageNetPretrained/swin/swin_large_patch4_window7_224_22k.pth" diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn.py new file mode 100644 index 00000000..2ea5c5e8 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn.py @@ -0,0 +1,84 @@ +from detectron2.config import LazyCall as L +from detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from 
detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from ..COCO.cascade_mask_rcnn_vitdet_1B_attn import ( + dataloader, + model, + train, + lr_multiplier, + optimizer, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + +# model.roi_heads.num_classes = 1203 +# model.roi_heads.box_predictor.test_score_thresh = 0.02 +# model.roi_heads.box_predictor.test_topk_per_image = 300 +# model.roi_heads.box_predictor.use_sigmoid_ce = True +# model.roi_heads.box_predictor.use_fed_loss = True +# model.roi_heads.box_predictor.get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( +# dataloader.train.dataset.names, 0.5 +# ) +# [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) + +# Schedule +# 100 ep = 156250 iters * 64 images/iter / 100000 images/ep +train.max_iter = 156250 +train.eval_period = 30000 + +lr_multiplier.scheduler.milestones = [138889, 150463] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 250 / train.max_iter + +optimizer.lr = 2e-4 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn.py.bak b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn.py.bak new file mode 100644 index 00000000..6f8a5384 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn.py.bak @@ -0,0 +1,84 @@ +from detectron2.config import LazyCall as L +from detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from ..COCO.cascade_mask_rcnn_vitdet_1B_attn import ( + dataloader, + model, + train, + lr_multiplier, + optimizer, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + 
repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + +# model.roi_heads.num_classes = 1203 +# model.roi_heads.box_predictor.test_score_thresh = 0.02 +# model.roi_heads.box_predictor.test_topk_per_image = 300 +# model.roi_heads.box_predictor.use_sigmoid_ce = True +# model.roi_heads.box_predictor.use_fed_loss = True +# model.roi_heads.box_predictor.get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( +# dataloader.train.dataset.names, 0.5 +# ) +# [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) + +# Schedule +# 100 ep = 156250 iters * 64 images/iter / 100000 images/ep +train.max_iter = 156250 +train.eval_period = 5000 + +lr_multiplier.scheduler.milestones = [50000, 150463] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 250 / train.max_iter + +optimizer.lr = 2e-4 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn_1536.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn_1536.py new file mode 100644 index 00000000..0520dbee --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_1B_attn_1536.py @@ -0,0 +1,84 @@ +from detectron2.config import LazyCall as L +from detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from ..COCO.cascade_mask_rcnn_vitdet_1B_attn_1536 import ( + dataloader, + model, + train, + lr_multiplier, + optimizer, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + +# model.roi_heads.num_classes = 1203 +# model.roi_heads.box_predictor.test_score_thresh = 0.02 +# 
model.roi_heads.box_predictor.test_topk_per_image = 300 +# model.roi_heads.box_predictor.use_sigmoid_ce = True +# model.roi_heads.box_predictor.use_fed_loss = True +# model.roi_heads.box_predictor.get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( +# dataloader.train.dataset.names, 0.5 +# ) +# [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.001, + test_topk_per_image=1000, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) + +# Schedule +# 100 ep = 156250 iters * 64 images/iter / 100000 images/ep +train.max_iter = 156250 +train.eval_period = 5000 + +lr_multiplier.scheduler.milestones = [50000, 150463] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 250 / train.max_iter + +optimizer.lr = 2e-4 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..8115224c --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,51 @@ +from detectron2.config import LazyCall as L +from detectron2.data.detection_utils import get_fed_loss_cls_weights +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + optimizer, + train, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_h_100ep.py 
b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_h_100ep.py new file mode 100644 index 00000000..68bec573 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_h_100ep.py @@ -0,0 +1,51 @@ +from detectron2.config import LazyCall as L +from detectron2.data.detection_utils import get_fed_loss_cls_weights +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from .mask_rcnn_vitdet_h_100ep import ( + dataloader, + lr_multiplier, + model, + optimizer, + train, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_l_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_l_100ep.py new file mode 100644 index 00000000..ebaf526a --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/cascade_mask_rcnn_vitdet_l_100ep.py @@ -0,0 +1,51 @@ +from detectron2.config import LazyCall as L +from detectron2.data.detection_utils import get_fed_loss_cls_weights +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from .mask_rcnn_vitdet_l_100ep import ( + dataloader, + lr_multiplier, + model, + optimizer, + train, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in 
[0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..ef905457 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,44 @@ +from detectron2.config import LazyCall as L +from detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from ..COCO.mask_rcnn_vitdet_b_100ep import ( + dataloader, + model, + train, + lr_multiplier, + optimizer, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + +model.roi_heads.num_classes = 1203 +model.roi_heads.box_predictor.test_score_thresh = 0.02 +model.roi_heads.box_predictor.test_topk_per_image = 300 +model.roi_heads.box_predictor.use_sigmoid_ce = True +model.roi_heads.box_predictor.use_fed_loss = True +model.roi_heads.box_predictor.get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 +) + +# Schedule +# 100 ep = 156250 iters * 64 images/iter / 100000 images/ep +train.max_iter = 156250 +train.eval_period = 30000 + +lr_multiplier.scheduler.milestones = [138889, 150463] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 250 / train.max_iter + +optimizer.lr = 2e-4 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_h_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_h_100ep.py new file mode 100644 index 00000000..fd82ff97 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_h_100ep.py @@ -0,0 +1,28 @@ +from functools import partial + +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, +) + +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth" + +model.backbone.net.embed_dim = 1280 +model.backbone.net.depth = 32 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.4 +# 7, 15, 23, 31 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) +) + + +optimizer.lr = 1e-4 +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_l_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_l_100ep.py new file mode 100644 index 00000000..de0981ac --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/LVIS/mask_rcnn_vitdet_l_100ep.py @@ -0,0 +1,24 @@ +from functools import partial + +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, +) + +train.init_checkpoint = 
"detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth" + +model.backbone.net.embed_dim = 1024 +model.backbone.net.depth = 24 +model.backbone.net.num_heads = 16 +model.backbone.net.drop_path_rate = 0.4 +# 5, 11, 17, 23 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23)) +) + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/Objects365/mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/Objects365/mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..aa296010 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/Objects365/mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,41 @@ +from functools import partial +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from ..common.objects365_loader_lsj import dataloader + +model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model +model.roi_heads.mask_in_features = None +model.roi_heads.mask_pooler = None +model.roi_heads.mask_head = None +model.roi_heads.num_classes = 365 + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" + + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 # todo + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +# Optimizer +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj.py new file mode 100644 index 00000000..e6c2f1e9 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1024 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1024.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1024.py new file mode 100644 index 00000000..0892d126 --- /dev/null +++ 
b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1024.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1024 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1280.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1280.py new file mode 100644 index 00000000..39fd57c2 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1280.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1280 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1408.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1408.py new file mode 100644 index 00000000..ccfcc5f5 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1408.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1408 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1536.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1536.py new file mode 100644 index 00000000..65155b9a --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1536.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1536 
+dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1664.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1664.py new file mode 100644 index 00000000..57d1de41 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1664.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1664 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1792.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1792.py new file mode 100644 index 00000000..6b3d195c --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1792.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1792 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1920.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1920.py new file mode 100644 index 00000000..66d2e548 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_1920.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1920 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, 
target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2048.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2048.py new file mode 100644 index 00000000..9dd356f5 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2048.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 2048 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2176.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2176.py new file mode 100644 index 00000000..38cb1d43 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2176.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 2176 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2304.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2304.py new file mode 100644 index 00000000..1e6f7fe5 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2304.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 2304 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping 
+dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2432.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2432.py new file mode 100644 index 00000000..67eb140f --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2432.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 2432 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2560.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2560.py new file mode 100644 index 00000000..0ed7bb50 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/coco_loader_lsj_2560.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 2560 +dataloader = model_zoo.get_config("common/data/coco.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping +dataloader.train.mapper.recompute_boxes = True + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_loader_lsj.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_loader_lsj.py new file mode 100644 index 00000000..c68fcfa1 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_loader_lsj.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1024 +dataloader = model_zoo.get_config("common/data/objects365.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping todo: how to maintain right bbox anno with cropping ? 
+dataloader.train.mapper.recompute_boxes = False + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_loader_lsj_1280.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_loader_lsj_1280.py new file mode 100644 index 00000000..810f17af --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_loader_lsj_1280.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1280 +dataloader = model_zoo.get_config("common/data/objects365.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping todo: how to maintain right bbox anno with cropping ? +dataloader.train.mapper.recompute_boxes = False + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1024.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1024.py new file mode 100644 index 00000000..7ba8bee4 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1024.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1024 +dataloader = model_zoo.get_config("common/data/objects365_trainval.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping todo: how to maintain right bbox anno with cropping ? 
+dataloader.train.mapper.recompute_boxes = False + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1280.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1280.py new file mode 100644 index 00000000..9952ec3c --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1280.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1280 +dataloader = model_zoo.get_config("common/data/objects365_trainval.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping todo: how to maintain right bbox anno with cropping ? +dataloader.train.mapper.recompute_boxes = False + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1536.py b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1536.py new file mode 100644 index 00000000..74edebde --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/common/objects365_trainval_loader_lsj_1536.py @@ -0,0 +1,22 @@ +import detectron2.data.transforms as T +from detectron2 import model_zoo +from detectron2.config import LazyCall as L + +# Data using LSJ +image_size = 1536 +dataloader = model_zoo.get_config("common/data/objects365_trainval.py").dataloader +dataloader.train.mapper.augmentations = [ + L(T.RandomFlip)(horizontal=True), # flip first + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), + L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), +] +dataloader.train.mapper.image_format = "RGB" +dataloader.train.total_batch_size = 64 +# recompute boxes due to cropping todo: how to maintain right bbox anno with cropping ? 
+dataloader.train.mapper.recompute_boxes = False + +dataloader.test.mapper.augmentations = [ + L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), +] diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cascade_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cascade_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..95823ef4 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cascade_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,48 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import ( + FastRCNNOutputLayers, + FastRCNNConvFCHead, + CascadeROIHeads, +) + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + cls_agnostic_bbox_reg=True, + num_classes="${...num_classes}", + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cb_cascade_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cb_cascade_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..1afb2489 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cb_cascade_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,48 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import ( + FastRCNNOutputLayers, + FastRCNNConvFCHead, + CascadeROIHeads, +) + +from .cb_mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + cls_agnostic_bbox_reg=True, + num_classes="${...num_classes}", + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git 
a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cb_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cb_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..a6bbb043 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/cb_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,38 @@ +from functools import partial +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from ..common.coco_loader_lsj import dataloader + + +model = model_zoo.get_config("common/models/cb_mask_rcnn_vitdet.py").model + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" + + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +# Optimizer +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_b_6attn_win32_1536_lrd0p7.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_b_6attn_win32_1536_lrd0p7.py new file mode 100644 index 00000000..a0625968 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_b_6attn_win32_1536_lrd0p7.py @@ -0,0 +1,45 @@ +from functools import partial + +from ..common.coco_loader_lsj_1536 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "" + +model.backbone.net.img_size = 1536 +model.backbone.square_pad = 1536 +model.backbone.net.patch_size = 16 +model.backbone.net.window_size = 32 +model.backbone.net.embed_dim = 768 +model.backbone.net.depth = 12 +model.backbone.net.num_heads = 12 +model.backbone.net.mlp_ratio = 4*2/3 +model.backbone.net.use_act_checkpoint = False +model.backbone.net.drop_path_rate = 0.1 + + +# 1, 3, 5, 7, 9, 11 for global attention +model.backbone.net.window_block_indexes = [0, 2, 4, 6, 8, 10] + +optimizer.lr=5e-5 +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.7, num_layers=12) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + + +train.max_iter = 60000 +lr_multiplier.scheduler.milestones = [ + train.max_iter*8//10, train.max_iter*9//10 +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 1000 / train.max_iter + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=128 + diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py new file 
mode 100644 index 00000000..8b2ca7b9 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py @@ -0,0 +1,46 @@ +from functools import partial + +from ..common.coco_loader_lsj_1024 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "" + +model.backbone.net.img_size = 1024 +model.backbone.square_pad = 1024 +model.backbone.net.patch_size = 16 +model.backbone.net.window_size = 16 +model.backbone.net.embed_dim = 1024 +model.backbone.net.depth = 24 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 4*2/3 +model.backbone.net.use_act_checkpoint = False +model.backbone.net.drop_path_rate = 0.4 + +# 5, 11, 17, 23 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23)) +) + +optimizer.lr=6e-5 +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + + +train.max_iter = 60000 +lr_multiplier.scheduler.milestones = [ + train.max_iter*8//10, train.max_iter*9//10 +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 1000 / train.max_iter + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=144 + diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py new file mode 100644 index 00000000..b0c0a345 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/eva2_coco_cascade_mask_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py @@ -0,0 +1,46 @@ +from functools import partial + +from ..common.coco_loader_lsj_1536 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + # dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "" + +model.backbone.net.img_size = 1536 +model.backbone.square_pad = 1536 +model.backbone.net.patch_size = 16 +model.backbone.net.window_size = 32 +model.backbone.net.embed_dim = 1024 +model.backbone.net.depth = 24 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 4*2/3 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.4 + +# 2, 5, 8, 11, 14, 17, 20, 23 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 2)) + list(range(3, 5)) + list(range(6, 8)) + list(range(9, 11)) + list(range(12, 14)) + list(range(15, 17)) + list(range(18, 20)) + list(range(21, 23)) +) + +optimizer.lr=5e-5 +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + + +train.max_iter = 60000 +lr_multiplier.scheduler.milestones = [ + train.max_iter*8//10, train.max_iter*9//10 +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 1000 / train.max_iter + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=128 + diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 
00000000..7206525f --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_coco/mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,38 @@ +from functools import partial +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from ..common.coco_loader_lsj import dataloader + + +model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" + + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +# Optimizer +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/cascade_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/cascade_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..95823ef4 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/cascade_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,48 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import ( + FastRCNNOutputLayers, + FastRCNNConvFCHead, + CascadeROIHeads, +) + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + cls_agnostic_bbox_reg=True, + num_classes="${...num_classes}", + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_b_4attn_1024_lrd0p7.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_b_4attn_1024_lrd0p7.py new file mode 100644 index 00000000..9a453964 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_b_4attn_1024_lrd0p7.py @@ -0,0 +1,80 @@ +from detectron2.config import LazyCall as L +from 
detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from ..eva2_mim_to_coco.eva2_coco_cascade_mask_rcnn_vitdet_b_4attn_1024_lrd0p7 import ( + dataloader, + lr_multiplier, + model, + optimizer, + train, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) + + +optimizer.lr=1e-4 + +train.max_iter = 50000 +lr_multiplier.scheduler.milestones = [ + train.max_iter*8//10, train.max_iter*9//10 +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 1000 / train.max_iter + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=128 + +train.eval_period=2500 +train.checkpointer.period=2500 \ No newline at end of file diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py new file mode 100644 index 00000000..fcc204a2 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_l_4attn_1024_lrd0p8.py @@ -0,0 +1,79 @@ +from detectron2.config import LazyCall as L +from detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from ..eva2_mim_to_coco.eva2_coco_cascade_mask_rcnn_vitdet_l_4attn_1024_lrd0p8 import ( + dataloader, + lr_multiplier, + model, + optimizer, + train, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + 
repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) + +optimizer.lr=1e-4 + +train.max_iter = 40000 +lr_multiplier.scheduler.milestones = [ + train.max_iter*8//10, train.max_iter*9//10 +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 1000 / train.max_iter + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=128 + +train.eval_period=2500 +train.checkpointer.period=2500 \ No newline at end of file diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py new file mode 100644 index 00000000..8de575a5 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/eva2_lvis_cascade_mask_rcnn_vitdet_l_8attn_win32_1536_lrd0p8.py @@ -0,0 +1,79 @@ +from detectron2.config import LazyCall as L +from detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from ..eva2_mim_to_coco.eva2_coco_cascade_mask_rcnn_vitdet_l_8attn_win32_1536_lrd0p8 import ( + dataloader, + lr_multiplier, + model, + optimizer, + train, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + 
box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) + +optimizer.lr=1e-4 + +train.max_iter = 40000 +lr_multiplier.scheduler.milestones = [ + train.max_iter*8//10, train.max_iter*9//10 +] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 1000 / train.max_iter + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=128 + +train.eval_period=2500 +train.checkpointer.period=2500 \ No newline at end of file diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..7206525f --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_mim_to_lvis/mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,38 @@ +from functools import partial +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from ..common.coco_loader_lsj import dataloader + + +model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" + + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +# Optimizer +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/cascade_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/cascade_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..95823ef4 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/cascade_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,48 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import ( + FastRCNNOutputLayers, + FastRCNNConvFCHead, + CascadeROIHeads, +) + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + box_heads=[ + 
L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + cls_agnostic_bbox_reg=True, + num_classes="${...num_classes}", + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/eva2_o365_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/eva2_o365_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py new file mode 100644 index 00000000..14bf59a9 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/eva2_o365_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py @@ -0,0 +1,62 @@ +from functools import partial + +from fvcore.common.param_scheduler import MultiStepParamScheduler +from detectron2.solver import WarmupParamScheduler +from detectron2.config import LazyCall as L + +from ..common.objects365_trainval_loader_lsj_1536 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +train.init_checkpoint = "" + +# for o365 +model.roi_heads.mask_in_features = None +model.roi_heads.mask_pooler = None +model.roi_heads.mask_head = None +model.roi_heads.num_classes = 365 + +# for model +model.backbone.net.img_size = 1536 +model.backbone.square_pad = 1536 +model.backbone.net.patch_size = 16 +model.backbone.net.window_size = 16 +model.backbone.net.embed_dim = 1024 +model.backbone.net.depth = 24 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 4*2/3 +model.backbone.net.use_act_checkpoint = False +model.backbone.net.drop_path_rate = 0.4 + +# 2, 5, 8, 11, 14, 17, 20, 23 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 2)) + list(range(3, 5)) + list(range(6, 8)) + list(range(9, 11)) + list(range(12, 14)) + list(range(15, 17)) + list(range(18, 20)) + list(range(21, 23)) +) + +optimizer.lr=6e-5 +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = 400000 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[train.max_iter*8//10, train.max_iter*9//10], + num_updates=train.max_iter, + ), + warmup_length=5000 / train.max_iter, + warmup_factor=0.001, +) + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=160 + +train.checkpointer.period = 2500 +train.eval_period = 10000 + diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..7206525f --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365/mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,38 @@ +from functools import partial +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from ..common.coco_loader_lsj 
import dataloader + + +model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" + + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +# Optimizer +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cascade_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cascade_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..95823ef4 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cascade_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,48 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import ( + FastRCNNOutputLayers, + FastRCNNConvFCHead, + CascadeROIHeads, +) + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + cls_agnostic_bbox_reg=True, + num_classes="${...num_classes}", + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_cascade_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_cascade_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..1afb2489 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_cascade_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,48 @@ +from detectron2.config import LazyCall as L +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import ( + FastRCNNOutputLayers, + FastRCNNConvFCHead, + CascadeROIHeads, +) + +from .cb_mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + 
+model.roi_heads.update( + _target_=CascadeROIHeads, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + test_score_thresh=0.05, + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + cls_agnostic_bbox_reg=True, + num_classes="${...num_classes}", + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py new file mode 100644 index 00000000..6cb47757 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py @@ -0,0 +1,91 @@ +from functools import partial + +from ..common.coco_loader_lsj_1536 import dataloader +from .cb_cascade_mask_rcnn_vitdet_b_100ep import ( + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +from detectron2.config import LazyCall as L +from fvcore.common.param_scheduler import * +from detectron2.solver import WarmupParamScheduler + + + +model.backbone.net.img_size = 1536 +model.backbone.square_pad = 1536 +model.backbone.net.patch_size = 16 +model.backbone.net.window_size = 16 +model.backbone.net.embed_dim = 1024 +model.backbone.net.depth = 24 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 4*2/3 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.3 + +model.backbone.net.cb_out_index = [2, 5, 20, 23] +model.backbone.net.del_patch_embed = False + +# 2, 5, 8, 11, 14, 17, 20, 23 for global attention +# 2 2 18 2 swin-L +# 2 5 20 23 +model.backbone.net.window_block_indexes = ( + list(range(0, 2)) + list(range(3, 5)) + list(range(6, 8)) + list(range(9, 11)) + list(range(12, 14)) + list(range(15, 17)) + list(range(18, 20)) + list(range(21, 23)) +) + +model.backbone.cb_net.img_size = 1536 +model.backbone.cb_net.patch_size = 16 +model.backbone.cb_net.window_size = 16 +model.backbone.cb_net.embed_dim = 1024 +model.backbone.cb_net.depth = 24 +model.backbone.cb_net.num_heads = 16 +model.backbone.cb_net.mlp_ratio = 4*2/3 +model.backbone.cb_net.use_act_checkpoint = True +model.backbone.cb_net.drop_path_rate = 0.3 + +model.backbone.cb_net.cb_out_index = [2, 5, 20, 23] +model.backbone.cb_net.del_patch_embed = True + +# 2, 5, 8, 11, 14, 17, 20, 23 for global attention +# 2 2 18 2 swin-L +# 2 5 20 23 +model.backbone.cb_net.window_block_indexes = ( + list(range(0, 2)) + list(range(3, 5)) + list(range(6, 8)) + list(range(9, 11)) + list(range(12, 14)) + list(range(15, 17)) + list(range(18, 20)) + list(range(21, 23)) +) + + +optimizer.lr=4e-5 + +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24) +optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = 40000 + +train.model_ema.enabled=True +train.model_ema.device="cuda" +train.model_ema.decay=0.9999 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(CosineParamScheduler)( + start_value=1, + end_value=1, + ), + warmup_length=0.01, + warmup_factor=0.001, 
+) + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=64 + +dataloader.test.dataset.names = "coco_2017_test-dev" +dataloader.evaluator.output_dir = './cb_output_eva_trainval_results' + +train.checkpointer.period=1000 +train.checkpointer.max_to_keep=10 # options for PeriodicCheckpointer +train.eval_period=40000 + + diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..a6bbb043 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,38 @@ +from functools import partial +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from ..common.coco_loader_lsj import dataloader + + +model = model_zoo.get_config("common/models/cb_mask_rcnn_vitdet.py").model + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" + + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +# Optimizer +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py new file mode 100644 index 00000000..98cabbfc --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py @@ -0,0 +1,56 @@ +from functools import partial + +from ..common.coco_loader_lsj_1536 import dataloader +from .cascade_mask_rcnn_vitdet_b_100ep import ( + lr_multiplier, + model, + train, + optimizer, + get_vit_lr_decay_rate, +) + +from detectron2.config import LazyCall as L +from fvcore.common.param_scheduler import * +from detectron2.solver import WarmupParamScheduler + + + +model.backbone.net.img_size = 1536 +model.backbone.square_pad = 1536 +model.backbone.net.patch_size = 16 +model.backbone.net.window_size = 16 +model.backbone.net.embed_dim = 1024 +model.backbone.net.depth = 24 +model.backbone.net.num_heads = 16 +model.backbone.net.mlp_ratio = 4*2/3 +model.backbone.net.use_act_checkpoint = True +model.backbone.net.drop_path_rate = 0.3 + +# 2, 5, 8, 11, 14, 17, 20, 23 for global attention +model.backbone.net.window_block_indexes = ( + list(range(0, 2)) + list(range(3, 5)) + list(range(6, 8)) + list(range(9, 11)) + list(range(12, 14)) + list(range(15, 17)) + list(range(18, 20)) + list(range(21, 23)) +) + +optimizer.lr=4e-5 +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24) 
+optimizer.params.overrides = {} +optimizer.params.weight_decay_norm = None + +train.max_iter = 40000 + +train.model_ema.enabled=True +train.model_ema.device="cuda" +train.model_ema.decay=0.9999 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(CosineParamScheduler)( + start_value=1, + end_value=1, + ), + warmup_length=0.01, + warmup_factor=0.001, +) + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=64 + diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..7206525f --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,38 @@ +from functools import partial +from fvcore.common.param_scheduler import MultiStepParamScheduler + +from detectron2 import model_zoo +from detectron2.config import LazyCall as L +from detectron2.solver import WarmupParamScheduler +from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate + +from ..common.coco_loader_lsj import dataloader + + +model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model + +# Initialization and trainer settings +train = model_zoo.get_config("common/train.py").train +train.amp.enabled = True +train.ddp.fp16_compression = True +train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth" + + +# Schedule +# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep +train.max_iter = 184375 + +lr_multiplier = L(WarmupParamScheduler)( + scheduler=L(MultiStepParamScheduler)( + values=[1.0, 0.1, 0.01], + milestones=[163889, 177546], + num_updates=train.max_iter, + ), + warmup_length=250 / train.max_iter, + warmup_factor=0.001, +) + +# Optimizer +optimizer = model_zoo.get_config("common/optim.py").AdamW +optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7) +optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/cascade_mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/cascade_mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..8115224c --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/cascade_mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,51 @@ +from detectron2.config import LazyCall as L +from detectron2.data.detection_utils import get_fed_loss_cls_weights +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from .mask_rcnn_vitdet_b_100ep import ( + dataloader, + lr_multiplier, + model, + optimizer, + train, +) + +# arguments that don't exist for Cascade R-CNN +[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + 
cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/eva2_o365_to_lvis_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/eva2_o365_to_lvis_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py new file mode 100644 index 00000000..ee813efd --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/eva2_o365_to_lvis_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py @@ -0,0 +1,71 @@ +from detectron2.config import LazyCall as L +from detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator +from detectron2.data.detection_utils import get_fed_loss_cls_weights + +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads + +from ..eva2_o365_to_coco.eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8 import ( + dataloader, + model, + train, + lr_multiplier, + optimizer, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + + +model.roi_heads.update( + _target_=CascadeROIHeads, + num_classes=1203, + box_heads=[ + L(FastRCNNConvFCHead)( + input_shape=ShapeSpec(channels=256, height=7, width=7), + conv_dims=[256, 256, 256, 256], + fc_dims=[1024], + conv_norm="LN", + ) + for _ in range(3) + ], + box_predictors=[ + L(FastRCNNOutputLayers)( + input_shape=ShapeSpec(channels=1024), + box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), + num_classes="${...num_classes}", + test_score_thresh=0.02, + test_topk_per_image=300, + cls_agnostic_bbox_reg=True, + use_sigmoid_ce=True, + use_fed_loss=True, + get_fed_loss_cls_weights=lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 + ), + ) + for (w1, w2) in [(10, 5), (20, 10), (30, 15)] + ], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) + for th in [0.5, 0.6, 0.7] + ], +) + +dataloader.test.num_workers=0 +dataloader.train.total_batch_size=64 + +train.max_iter = 70000 +train.eval_period=5000 +train.checkpointer.period=5000 diff --git a/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/mask_rcnn_vitdet_b_100ep.py b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/mask_rcnn_vitdet_b_100ep.py new file mode 100644 index 00000000..ef905457 --- /dev/null +++ b/EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_lvis/mask_rcnn_vitdet_b_100ep.py @@ -0,0 +1,44 @@ +from detectron2.config import LazyCall as L +from detectron2.data.samplers import RepeatFactorTrainingSampler +from detectron2.evaluation.lvis_evaluation import LVISEvaluator +from detectron2.data.detection_utils import 
get_fed_loss_cls_weights + +from ..COCO.mask_rcnn_vitdet_b_100ep import ( + dataloader, + model, + train, + lr_multiplier, + optimizer, +) + +dataloader.train.dataset.names = "lvis_v1_train" +dataloader.train.sampler = L(RepeatFactorTrainingSampler)( + repeat_factors=L(RepeatFactorTrainingSampler.repeat_factors_from_category_frequency)( + dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001 + ) +) +dataloader.test.dataset.names = "lvis_v1_val" +dataloader.evaluator = L(LVISEvaluator)( + dataset_name="${..test.dataset.names}", + max_dets_per_image=300, +) + +model.roi_heads.num_classes = 1203 +model.roi_heads.box_predictor.test_score_thresh = 0.02 +model.roi_heads.box_predictor.test_topk_per_image = 300 +model.roi_heads.box_predictor.use_sigmoid_ce = True +model.roi_heads.box_predictor.use_fed_loss = True +model.roi_heads.box_predictor.get_fed_loss_cls_weights = lambda: get_fed_loss_cls_weights( + dataloader.train.dataset.names, 0.5 +) + +# Schedule +# 100 ep = 156250 iters * 64 images/iter / 100000 images/ep +train.max_iter = 156250 +train.eval_period = 30000 + +lr_multiplier.scheduler.milestones = [138889, 150463] +lr_multiplier.scheduler.num_updates = train.max_iter +lr_multiplier.warmup_length = 250 / train.max_iter + +optimizer.lr = 2e-4 diff --git a/EVA/README.md b/EVA/README.md new file mode 100644 index 00000000..a87f4d64 --- /dev/null +++ b/EVA/README.md @@ -0,0 +1,75 @@ +# CBNet: A Composite Backbone Network Architecture for Object Detection + +### EVA02 (1536x1536) +| Backbone | Lr Schd | mask mAP (test-dev) | #params | config | model | +| :--------: | :-----: | :-----------------: | :-----: | :----------------------------------------------------------: | :-----------------------------------------------: | +| DB-EVA02-L | 1x | 56.1 | 674M | [config](EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py) | [HF](https://huggingface.co/weeewe/CBNetV2-EVA02) | + +- **Pre-trained models of EVA02 can be downloaded from [EVA02 pretrain](https://github.com/baaivision/EVA/tree/master/EVA-02/det)**. + +## Usage + +Please refer to [EVA](https://github.com/baaivision/EVA/tree/master/EVA-02/) for cloning the code, installation, and dataset preparation. +Then copy the files under `EVA-02` in this CBNet repository into the original EVA repository, replacing or adding files as needed. + +Download the pre-trained weights (`eva02_L_m38m_to_o365.pth`) from [EVA02 pretrain](https://github.com/baaivision/EVA/tree/master/EVA-02/det). +Then, run +``` +python get_cb_ckpt.py +``` +to create the CBNet pre-trained weights (`cb_eva02_L_m38m_to_o365.pth`).
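+
+The conversion in `get_cb_ckpt.py` simply duplicates every pre-trained weight into the two branches of the composite backbone, i.e. under the `backbone.net.*` and `backbone.cb_net.*` names that the CB configs expect. As a quick sanity check after running the script (a minimal sketch; the filename is the one used in this README, adjust it to your own path):
+
+```python
+import torch
+
+# load the converted composite checkpoint (filename assumed from this README)
+ckpt = torch.load("cb_eva02_L_m38m_to_o365.pth", map_location="cpu")
+keys = list(ckpt["model"].keys())
+
+# both backbone branches should now carry the same number of pre-trained tensors
+n_net = sum(k.startswith("backbone.net.") for k in keys)
+n_cb_net = sum(k.startswith("backbone.cb_net.") for k in keys)
+print(n_net, n_cb_net)
+assert n_net == n_cb_net > 0
+```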
+ +### Training + +To train CBNet-EVA-L with pre-trained models, run: +``` +# multi-gpu training +python tools/lazyconfig_train_net.py \ + --num-gpus N_GPU \ + --config-file projects/ViTDet/configs/eva2_o365_to_coco/cb_eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py \ + train.output_dir=YOUR_OUTPUT_PATH \ + train.init_checkpoint=PATH_TO/cb_eva02_L_m38m_to_o365.pth +``` + +### Inference on Test + +To evaluate the trained CBNet-EVA-L model on COCO test-dev, run: +``` +python tools/lazyconfig_train_net.py \ + --num-gpus N_GPU \ + --config-file projects/ViTDet/configs/eva2_o365_to_coco/cb_eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py \ + --eval-only \ + train.model_ema.use_ema_weights_for_eval_only=True \ + model.roi_heads.use_soft_nms=True \ + model.roi_heads.class_wise=True \ + model.roi_heads.method=linear \ + model.roi_heads.iou_threshold=0.5 \ + model.roi_heads.override_score_thresh=0.0 \ + model.roi_heads.maskness_thresh=0.5 \ + train.init_checkpoint=YOUR_OUTPUT_PATH/model_final.pth \ + dataloader.evaluator.output_dir=YOUR_OUTPUT_PATH +``` + + +Another example, from the original mmdetection-based CBNetV2 code: to train a Mask R-CNN model with a `Dual-Swin-T` backbone and 8 gpus, run: +``` +tools/dist_train.sh configs/cbnet/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.py 8 --cfg-options model.pretrained= +``` + +## Citation +If you use our code/model, please consider citing our paper [CBNet: A Composite Backbone Network Architecture for Object Detection](http://arxiv.org/abs/2107.00420). +``` +@ARTICLE{9932281, + author={Liang, Tingting and Chu, Xiaojie and Liu, Yudong and Wang, Yongtao and Tang, Zhi and Chu, Wei and Chen, Jingdong and Ling, Haibin}, + journal={IEEE Transactions on Image Processing}, + title={CBNet: A Composite Backbone Network Architecture for Object Detection}, + year={2022}, + volume={31}, + pages={6893-6906}, + doi={10.1109/TIP.2022.3216771}} +``` + +## License +The project is only free for academic research purposes, but needs authorization for commerce. For commerce permission, please contact wyt@pku.edu.cn. + + +## Other Links +> **Original CBNet**: See [CBNet: A Novel Composite Backbone Network Architecture for Object Detection](https://github.com/VDIGPKU/CBNet). \ No newline at end of file diff --git a/EVA/get_cb_ckpt.py b/EVA/get_cb_ckpt.py new file mode 100644 index 00000000..1f0b2933 --- /dev/null +++ b/EVA/get_cb_ckpt.py @@ -0,0 +1,15 @@ +import torch + +ckpt=torch.load('eva02_B_pt_in21k_p14to16.pt',map_location=torch.device("cpu")) + +model = ckpt['model'] + +keys = list(model.keys()) + +for k in keys: + model['backbone.cb_net.' + k] = model[k] + model['backbone.net.' 
+ k] = model[k] + del model[k] + +ckpt['model'] = model +torch.save(ckpt, 'cb_eva02_B_pt_in21k_p14to16.pth') diff --git a/README.md b/README.md old mode 100755 new mode 100644 index 5b860081..619153e2 --- a/README.md +++ b/README.md @@ -1,120 +1,124 @@ -# CBNet: A Composite Backbone Network Architecture for Object Detection -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/cbnetv2-a-composite-backbone-network/object-detection-on-coco)](https://paperswithcode.com/sota/object-detection-on-coco?p=cbnetv2-a-composite-backbone-network) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/cbnetv2-a-composite-backbone-network/instance-segmentation-on-coco)](https://paperswithcode.com/sota/instance-segmentation-on-coco?p=cbnetv2-a-composite-backbone-network) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/cbnetv2-a-composite-backbone-network/object-detection-on-coco-minival)](https://paperswithcode.com/sota/object-detection-on-coco-minival?p=cbnetv2-a-composite-backbone-network) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/cbnetv2-a-composite-backbone-network/instance-segmentation-on-coco-minival)](https://paperswithcode.com/sota/instance-segmentation-on-coco-minival?p=cbnetv2-a-composite-backbone-network) - -By [Tingting Liang](https://github.com/tingtingliangvs)\*, [Xiaojie Chu](https://github.com/chuxiaojie)\*, [Yudong Liu](https://github.com/PKUbahuangliuhe)\*, Yongtao Wang, Zhi Tang, Wei Chu, Jingdong Chen, Haibin Ling. - -This repo is the official implementation of [CBNetV2](http://arxiv.org/abs/2107.00420). It is based on [mmdetection](https://github.com/open-mmlab/mmdetection) and [Swin Transformer for Object Detection](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection). - -Contact us with tingtingliang@pku.edu.cn, chuxiaojie@stu.pku.edu.cn, wyt@pku.edu.cn. -## Introduction -*CBNetV2* achieves strong single-model performance on COCO object detection (`60.1 box AP` and `52.3 mask AP` on test-dev) without extra training data. 
- -![teaser](figures/cbnetv2.png) - - -## Partial Results and Models -**More results and models can be found in [model zoo](model_zoo.md)** - -### Faster R-CNN -| Backbone | Lr Schd | box mAP (minival) | #params | FLOPs | config | log | model | -| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | -| DB-ResNet50 | 1x | 40.8 | 69M | 284G | [config](configs/cbnet/faster_rcnn_cbv2d1_r50_fpn_1x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/faster_rcnn_cbv2d1_r50_fpn_1x_coco.log.json)| [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/faster_rcnn_cbv2d1_r50_fpn_1x_coco.pth.zip)| - - -### Mask R-CNN - -| Backbone | Lr Schd | box mAP (minival) | mask mAP (minival) | #params | FLOPs | config | log | model | -| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | -| DB-Swin-T | 3x | 50.2 | 44.5 | 76M | 357G | [config](configs/cbnet/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.log.json) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.pth.zip) | - -### Cascade Mask R-CNN (1600x1400) -| Backbone | Lr Schd | box mAP (minival/test-dev)| mask mAP (minival/test-dev)| #params | FLOPs | config | model | -| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | -| DB-Swin-S | 3x | 56.3/56.9 | 48.6/49.1 | 156M | 1016G | [config](configs/cbnet/cascade_mask_rcnn_cbv2_swin_small_patch4_window7_mstrain_400-1400_adamw_3x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/cascade_mask_rcnn_cbv2_swin_small_patch4_window7_mstrain_400-1400_adamw_3x_coco.pth.zip)| - -### Improved HTC (1600x1400) -*We use ImageNet-22k pretrained checkpoints of Swin-B and Swin-L. Compared to regular HTC, our HTC uses 4conv1fc in bbox head.* -| Backbone | Lr Schd | box mAP (minival/test-dev) | mask mAP (minival/test-dev) | #params | FLOPs | config | model | -| :---: |:---: | :---: | :---: | :---: | :---: | :---: | :---: | -| DB-Swin-B | 20e | 58.4/58.7 | 50.7/51.1 | 235M | 1348G | [config](configs/cbnet/htc_cbv2_swin_base_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_20e_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/htc_cbv2_swin_base22k_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_20e_coco.pth.zip) | -| DB-Swin-L | 1x | 59.1/59.4 | 51.0/51.6 | 453M | 2162G | [config (test only)](configs/cbnet/htc_cbv2_swin_large_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/htc_cbv2_swin_large22k_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.pth.zip) | -| DB-Swin-L (TTA) | 1x | 59.6/60.1 | 51.8/52.3 | 453M | - | [config (test only)](configs/cbnet/htc_cbv2_swin_large_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/htc_cbv2_swin_large22k_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.pth.zip) | - -TTA denotes test time augmentation. - -**Notes**: - -- **Pre-trained models of Swin Transformer can be downloaded from [Swin Transformer for ImageNet Classification](https://github.com/microsoft/Swin-Transformer)**. 
- -## Usage - -### Installation - -Please refer to [get_started.md](https://github.com/open-mmlab/mmdetection/blob/master/docs/en/get_started.md) for installation and dataset preparation. - -### Inference -``` -# single-gpu testing (w/o segm result) -python tools/test.py --eval bbox - -# multi-gpu testing (w/ segm result) -tools/dist_test.sh --eval bbox segm -``` - -### Training - -To train a detector with pre-trained models, run: -``` -# multi-gpu training -tools/dist_train.sh -``` -For example, to train a Faster R-CNN model with a `Duel-ResNet50` backbone and 8 gpus, run: -``` -# path of pre-training model (resnet50) is already in config -tools/dist_train.sh configs/cbnet/faster_rcnn_cbv2d1_r50_fpn_1x_coco.py 8 -``` - -Another example, to train a Mask R-CNN model with a `Duel-Swin-T` backbone and 8 gpus, run: -``` -tools/dist_train.sh configs/cbnet/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.py 8 --cfg-options model.pretrained= -``` - - - -### Apex (optional): -Following [Swin Transformer for Object Detection](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection), we use apex for mixed precision training by default. To install apex, run: -``` -git clone https://github.com/NVIDIA/apex -cd apex -pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ -``` - -### Documents and Tutorials -*We list some documents and tutorials from [MMDetection](https://github.com/open-mmlab/mmdetection), which may be helpful to you.* -* [Learn about Configs](https://github.com/open-mmlab/mmdetection/blob/master/docs/tutorials/config.md) -* [Train with customized datasets](https://github.com/open-mmlab/mmdetection/blob/master/docs/2_new_data_model.md) -* [Finetuning Models](https://github.com/open-mmlab/mmdetection/blob/master/docs/tutorials/finetune.md) - - -## Citation -If you use our code/model, please consider to cite our paper [CBNet: A Composite Backbone Network Architecture for Object Detection](http://arxiv.org/abs/2107.00420). -``` -@ARTICLE{9932281, - author={Liang, Tingting and Chu, Xiaojie and Liu, Yudong and Wang, Yongtao and Tang, Zhi and Chu, Wei and Chen, Jingdong and Ling, Haibin}, - journal={IEEE Transactions on Image Processing}, - title={CBNet: A Composite Backbone Network Architecture for Object Detection}, - year={2022}, - volume={31}, - pages={6893-6906}, - doi={10.1109/TIP.2022.3216771}} -``` - -## License -The project is only free for academic research purposes, but needs authorization for commerce. For commerce permission, please contact wyt@pku.edu.cn. - - -## Other Links -> **Original CBNet**: See [CBNet: A Novel Composite Backbone Network Architecture for Object Detection](https://github.com/VDIGPKU/CBNet). 
+# CBNet: A Composite Backbone Network Architecture for Object Detection +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/cbnetv2-a-composite-backbone-network/object-detection-on-coco)](https://paperswithcode.com/sota/object-detection-on-coco?p=cbnetv2-a-composite-backbone-network) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/cbnetv2-a-composite-backbone-network/instance-segmentation-on-coco)](https://paperswithcode.com/sota/instance-segmentation-on-coco?p=cbnetv2-a-composite-backbone-network) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/cbnetv2-a-composite-backbone-network/object-detection-on-coco-minival)](https://paperswithcode.com/sota/object-detection-on-coco-minival?p=cbnetv2-a-composite-backbone-network) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/cbnetv2-a-composite-backbone-network/instance-segmentation-on-coco-minival)](https://paperswithcode.com/sota/instance-segmentation-on-coco-minival?p=cbnetv2-a-composite-backbone-network) + +By [Tingting Liang](https://github.com/tingtingliangvs)\*, [Xiaojie Chu](https://github.com/chuxiaojie)\*, [Yudong Liu](https://github.com/PKUbahuangliuhe)\*, Yongtao Wang, Zhi Tang, Wei Chu, Jingdong Chen, Haibin Ling. + +This repo is the official implementation of [CBNetV2](http://arxiv.org/abs/2107.00420). It is based on [mmdetection](https://github.com/open-mmlab/mmdetection) and [Swin Transformer for Object Detection](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection). + +Contact us with tingtingliang@pku.edu.cn, chuxiaojie@stu.pku.edu.cn, wyt@pku.edu.cn. +## Introduction +*CBNetV2* achieves strong single-model performance on COCO object detection (`60.1 box AP` and `52.3 mask AP` on test-dev) without extra training data. 
+
+![teaser](figures/cbnetv2.png)
+
+
+## Partial Results and Models
+**More results and models can be found in [model zoo](model_zoo.md)**
+
+### Faster R-CNN
+| Backbone | Lr Schd | box mAP (minival) | #params | FLOPs | config | log | model |
+| :---------: | :-----: | :---------------: | :-----: | :---: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| DB-ResNet50 | 1x | 40.8 | 69M | 284G | [config](configs/cbnet/faster_rcnn_cbv2d1_r50_fpn_1x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/faster_rcnn_cbv2d1_r50_fpn_1x_coco.log.json) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/faster_rcnn_cbv2d1_r50_fpn_1x_coco.pth.zip) |
+
+
+### Mask R-CNN
+
+| Backbone | Lr Schd | box mAP (minival) | mask mAP (minival) | #params | FLOPs | config | log | model |
+| :-------: | :-----: | :---------------: | :----------------: | :-----: | :---: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| DB-Swin-T | 3x | 50.2 | 44.5 | 76M | 357G | [config](configs/cbnet/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.log.json) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.pth.zip) |
+
+### Cascade Mask R-CNN (1600x1400)
+| Backbone | Lr Schd | box mAP (minival/test-dev) | mask mAP (minival/test-dev) | #params | FLOPs | config | model |
+| :-------: | :-----: | :------------------------: | :-------------------------: | :-----: | :---: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| DB-Swin-S | 3x | 56.3/56.9 | 48.6/49.1 | 156M | 1016G | [config](configs/cbnet/cascade_mask_rcnn_cbv2_swin_small_patch4_window7_mstrain_400-1400_adamw_3x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/cascade_mask_rcnn_cbv2_swin_small_patch4_window7_mstrain_400-1400_adamw_3x_coco.pth.zip) |
+
+### Improved HTC (1600x1400)
+*We use ImageNet-22k pretrained checkpoints of Swin-B and Swin-L. Compared to regular HTC, our HTC uses 4conv1fc in bbox head.*
+| Backbone | Lr Schd | box mAP (minival/test-dev) | mask mAP (minival/test-dev) | #params | FLOPs | config | model |
+| :-------------: | :-----: | :------------------------: | :-------------------------: | :-----: | :---: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| DB-Swin-B | 20e | 58.4/58.7 | 50.7/51.1 | 235M | 1348G | [config](configs/cbnet/htc_cbv2_swin_base_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_20e_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/htc_cbv2_swin_base22k_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_20e_coco.pth.zip) |
+| DB-Swin-L | 1x | 59.1/59.4 | 51.0/51.6 | 453M | 2162G | [config (test only)](configs/cbnet/htc_cbv2_swin_large_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/htc_cbv2_swin_large22k_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.pth.zip) |
+| DB-Swin-L (TTA) | 1x | 59.6/60.1 | 51.8/52.3 | 453M | - | [config (test only)](configs/cbnet/htc_cbv2_swin_large_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/htc_cbv2_swin_large22k_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.pth.zip) |
+
+TTA denotes test-time augmentation.
+
+### EVA02 (1536x1536)
+| Backbone | Lr Schd | mask mAP (test-dev) | #params | config | model |
+| :--------: | :-----: | :-----------------: | :-----: | :----------------------------------------------------------: | :-----------------------------------------------: |
+| DB-EVA02-L | 1x | 56.1 | 674M | [config](EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py) | [HF](https://huggingface.co/weeewe/CBNetV2-EVA02) |
+
+**Notes**:
+
+- **Pre-trained models of Swin Transformer can be downloaded from [Swin Transformer for ImageNet Classification](https://github.com/microsoft/Swin-Transformer)**.
+- **Pre-trained models of EVA02 can be downloaded from [EVA02 pretrain](https://github.com/baaivision/EVA/tree/master/EVA-02/det)**.
+
+## Usage
+
+### Installation
+
+Please refer to [get_started.md](https://github.com/open-mmlab/mmdetection/blob/master/docs/en/get_started.md) for installation and dataset preparation.
+
+### Inference
+```
+# single-gpu testing (w/o segm result)
+python tools/test.py <CONFIG_FILE> <DET_CHECKPOINT_FILE> --eval bbox
+
+# multi-gpu testing (w/ segm result)
+tools/dist_test.sh <CONFIG_FILE> <DET_CHECKPOINT_FILE> <GPU_NUM> --eval bbox segm
+```
+
+### Training
+
+To train a detector with pre-trained models, run:
+```
+# multi-gpu training
+tools/dist_train.sh <CONFIG_FILE> <GPU_NUM>
+```
+For example, to train a Faster R-CNN model with a `Dual-ResNet50` backbone and 8 GPUs, run:
+```
+# the path of the pre-trained model (resnet50) is already set in the config
+tools/dist_train.sh configs/cbnet/faster_rcnn_cbv2d1_r50_fpn_1x_coco.py 8
+```
+As another example, to train a Mask R-CNN model with a `Dual-Swin-T` backbone and 8 GPUs, run:
+```
+tools/dist_train.sh configs/cbnet/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.py 8 --cfg-options model.pretrained=<PRETRAIN_MODEL>
+```
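+
+In addition to the command-line tools above, a downloaded checkpoint can be sanity-checked from Python with mmdetection's high-level API. The snippet below is a minimal sketch: the config path is taken from the tables above, while the checkpoint and image paths are placeholders for wherever the released model is unzipped and whichever image you want to test on.
+```python
+# Minimal single-image inference sketch using mmdetection's high-level API.
+# The checkpoint and image paths are placeholders, not files shipped with this repo.
+from mmdet.apis import inference_detector, init_detector
+
+config_file = 'configs/cbnet/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.py'
+checkpoint_file = 'checkpoints/mask_rcnn_cbv2_swin_tiny_patch4_window7_mstrain_480-800_adamw_3x_coco.pth'
+
+# build the model from the config and load the trained weights
+model = init_detector(config_file, checkpoint_file, device='cuda:0')
+
+# run inference on one image and save a visualization of boxes and masks
+result = inference_detector(model, 'demo/demo.jpg')
+model.show_result('demo/demo.jpg', result, score_thr=0.3, out_file='demo_result.jpg')
+```
+If the checkpoint loads and the detections look reasonable, the full COCO evaluation can then be run with `tools/dist_test.sh` as shown in the Inference section above.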
+
+### Apex (optional):
+Following [Swin Transformer for Object Detection](https://github.com/SwinTransformer/Swin-Transformer-Object-Detection), we use apex for mixed precision training by default. To install apex, run:
+```
+git clone https://github.com/NVIDIA/apex
+cd apex
+pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+```
+
+### Documents and Tutorials
+*We list some documents and tutorials from [MMDetection](https://github.com/open-mmlab/mmdetection) that may be helpful.*
+* [Learn about Configs](https://github.com/open-mmlab/mmdetection/blob/master/docs/tutorials/config.md)
+* [Train with customized datasets](https://github.com/open-mmlab/mmdetection/blob/master/docs/2_new_data_model.md)
+* [Finetuning Models](https://github.com/open-mmlab/mmdetection/blob/master/docs/tutorials/finetune.md)
+
+
+## Citation
+If you use our code/model, please consider citing our paper [CBNet: A Composite Backbone Network Architecture for Object Detection](http://arxiv.org/abs/2107.00420).
+```
+@ARTICLE{9932281,
+  author={Liang, Tingting and Chu, Xiaojie and Liu, Yudong and Wang, Yongtao and Tang, Zhi and Chu, Wei and Chen, Jingdong and Ling, Haibin},
+  journal={IEEE Transactions on Image Processing},
+  title={CBNet: A Composite Backbone Network Architecture for Object Detection},
+  year={2022},
+  volume={31},
+  pages={6893-6906},
+  doi={10.1109/TIP.2022.3216771}}
+```
+
+## License
+This project is free for academic research only; commercial use requires authorization. For commercial licensing, please contact wyt@pku.edu.cn.
+
+
+## Other Links
+> **Original CBNet**: See [CBNet: A Novel Composite Backbone Network Architecture for Object Detection](https://github.com/VDIGPKU/CBNet).
\ No newline at end of file
diff --git a/model_zoo.md b/model_zoo.md
index 41663993..00582981 100644
--- a/model_zoo.md
+++ b/model_zoo.md
@@ -58,7 +58,13 @@
 | DB-Swin-L (TTA) | 1x | 59.6/60.1 | 51.8/52.3 | 453M | - | [config](configs/cbnet/htc_cbv2_swin_large_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.py) | [github](https://github.com/CBNetwork/storage/releases/download/v1.0.0/htc_cbv2_swin_large22k_patch4_window7_mstrain_400-1400_giou_4conv1f_adamw_1x_coco.pth.zip) |
 
 TTA denotes test time augmentation.
-
+
+### EVA02 (1536x1536)
+| Backbone | Lr Schd | mask mAP (test-dev) | #params | config | model |
+| :--------: | :-----: | :-----------------: | :-----: | :----------------------------------------------------------: | :-----------------------------------------------: |
+| DB-EVA02-L | 1x | 56.1 | 674M | [config](EVA/EVA-02/det/projects/ViTDet/configs/eva2_o365_to_coco/cb_eva2_o365_to_coco_cascade_mask_rcnn_vitdet_l_8attn_1536_lrd0p8.py) | [HF](https://huggingface.co/weeewe/CBNetV2-EVA02) |
+
+
 **Notes**:
 
 - **Pre-trained models of Swin Transformer can be downloaded from [Swin Transformer for ImageNet Classification](https://github.com/microsoft/Swin-Transformer)**.