From 028ecee13ca4eb38011c1b704848346d3902f1f4 Mon Sep 17 00:00:00 2001 From: wjf5203 Date: Thu, 8 Aug 2024 23:31:56 +0800 Subject: [PATCH] add video training and inference scripts for GLEE-Pro and improve inference speed --- .../GLEE/configs/videos/Pro/BURST_Pro.yaml | 42 +++++++++++++++++++ projects/GLEE/configs/videos/Pro/TAO_Pro.yaml | 42 +++++++++++++++++++ .../GLEE/configs/videos/Pro/ovis_Pro.yaml | 42 +++++++++++++++++++ .../GLEE/configs/videos/Pro/ytvis19_Pro.yaml | 42 +++++++++++++++++++ projects/GLEE/glee/data/ytvis_eval.py | 5 ++- 5 files changed, 171 insertions(+), 2 deletions(-) create mode 100644 projects/GLEE/configs/videos/Pro/BURST_Pro.yaml create mode 100644 projects/GLEE/configs/videos/Pro/TAO_Pro.yaml create mode 100644 projects/GLEE/configs/videos/Pro/ovis_Pro.yaml create mode 100644 projects/GLEE/configs/videos/Pro/ytvis19_Pro.yaml diff --git a/projects/GLEE/configs/videos/Pro/BURST_Pro.yaml b/projects/GLEE/configs/videos/Pro/BURST_Pro.yaml new file mode 100644 index 0000000..142ce4d --- /dev/null +++ b/projects/GLEE/configs/videos/Pro/BURST_Pro.yaml @@ -0,0 +1,42 @@ +_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml" +MODEL: + CROSS_TRACK: False + PSEUDO_VIDEO: False + FREEZE_WHOLE: False + BACKBONE: + NAME: "D2_EVA02" + EVA02: + CHECKPOINT: False + # PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth' + SEM_SEG_HEAD: + # pixel decoder + PIXEL_DECODER_NAME: "MaskDINOEncoder" + DIM_FEEDFORWARD: 2048 + NUM_FEATURE_LEVELS: 4 + TOTAL_NUM_FEATURE_LEVELS: 4 + IN_FEATURES: ["p3", "p4", "p5", "p6"] + DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"] + COMMON_STRIDE: 4 + TRANSFORMER_ENC_LAYERS: 6 + FEATURE_ORDER: "low2high" +DATASETS: + TRAIN: ("BURST_video_train",) + TEST: ("BURST_video_val",) +SOLVER: + IMS_PER_BATCH: 8 + BASE_LR: 0.0001 + STEPS: (6000, ) + MAX_ITER: 8000 + CHECKPOINT_PERIOD: 2000 +INPUT: + IMAGE_SIZE: 1536 + MIN_SCALE: 0.1 + MAX_SCALE: 2.0 + FORMAT: "RGB" + DATASET_MAPPER_NAME: "coco_instance_lsj" 
+TEST: + EVAL_PERIOD: 100000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 8 +OUTPUT_DIR: ./GLEE_Pro_BURST \ No newline at end of file diff --git a/projects/GLEE/configs/videos/Pro/TAO_Pro.yaml b/projects/GLEE/configs/videos/Pro/TAO_Pro.yaml new file mode 100644 index 0000000..0b6cc33 --- /dev/null +++ b/projects/GLEE/configs/videos/Pro/TAO_Pro.yaml @@ -0,0 +1,42 @@ +_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml" +MODEL: + CROSS_TRACK: False + PSEUDO_VIDEO: False + FREEZE_WHOLE: False + BACKBONE: + NAME: "D2_EVA02" + EVA02: + CHECKPOINT: False + # PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth' + SEM_SEG_HEAD: + # pixel decoder + PIXEL_DECODER_NAME: "MaskDINOEncoder" + DIM_FEEDFORWARD: 2048 + NUM_FEATURE_LEVELS: 4 + TOTAL_NUM_FEATURE_LEVELS: 4 + IN_FEATURES: ["p3", "p4", "p5", "p6"] + DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"] + COMMON_STRIDE: 4 + TRANSFORMER_ENC_LAYERS: 6 + FEATURE_ORDER: "low2high" +DATASETS: + TRAIN: ("BURST_video_train",) + TEST: ("TAO_video_val",) +SOLVER: + IMS_PER_BATCH: 8 + BASE_LR: 0.0001 + STEPS: (6000, ) + MAX_ITER: 8000 + CHECKPOINT_PERIOD: 2000 +INPUT: + IMAGE_SIZE: 1536 + MIN_SCALE: 0.1 + MAX_SCALE: 2.0 + FORMAT: "RGB" + DATASET_MAPPER_NAME: "coco_instance_lsj" +TEST: + EVAL_PERIOD: 100000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 8 +OUTPUT_DIR: ./GLEE_Pro_TAO \ No newline at end of file diff --git a/projects/GLEE/configs/videos/Pro/ovis_Pro.yaml b/projects/GLEE/configs/videos/Pro/ovis_Pro.yaml new file mode 100644 index 0000000..fe154c2 --- /dev/null +++ b/projects/GLEE/configs/videos/Pro/ovis_Pro.yaml @@ -0,0 +1,42 @@ +_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml" +MODEL: + CROSS_TRACK: False + PSEUDO_VIDEO: False + FREEZE_WHOLE: False + BACKBONE: + NAME: "D2_EVA02" + EVA02: + CHECKPOINT: False + # PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth' + SEM_SEG_HEAD: + # pixel decoder + PIXEL_DECODER_NAME: 
"MaskDINOEncoder" + DIM_FEEDFORWARD: 2048 + NUM_FEATURE_LEVELS: 4 + TOTAL_NUM_FEATURE_LEVELS: 4 + IN_FEATURES: ["p3", "p4", "p5", "p6"] + DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"] + COMMON_STRIDE: 4 + TRANSFORMER_ENC_LAYERS: 6 + FEATURE_ORDER: "low2high" +DATASETS: + TRAIN: ("ovis_train",) + TEST: ("ovis_val",) +SOLVER: + IMS_PER_BATCH: 8 + BASE_LR: 0.0001 + STEPS: (12000, ) + MAX_ITER: 18000 + CHECKPOINT_PERIOD: 2000 +INPUT: + IMAGE_SIZE: 1536 + MIN_SCALE: 0.1 + MAX_SCALE: 2.0 + FORMAT: "RGB" + DATASET_MAPPER_NAME: "coco_instance_lsj" +TEST: + EVAL_PERIOD: 100000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: True + NUM_WORKERS: 8 +OUTPUT_DIR: ./GLEE_Pro_ovis \ No newline at end of file diff --git a/projects/GLEE/configs/videos/Pro/ytvis19_Pro.yaml b/projects/GLEE/configs/videos/Pro/ytvis19_Pro.yaml new file mode 100644 index 0000000..21ad6c2 --- /dev/null +++ b/projects/GLEE/configs/videos/Pro/ytvis19_Pro.yaml @@ -0,0 +1,42 @@ +_BASE_: "../../images/Lite/base_clip_frozen_image_R50.yaml" +MODEL: + CROSS_TRACK: False + PSEUDO_VIDEO: False + FREEZE_WHOLE: False + BACKBONE: + NAME: "D2_EVA02" + EVA02: + CHECKPOINT: False + # PRETRAINED_WEIGHT: 'weights/converted_EVA02_m38m_psz14to16.pth' + SEM_SEG_HEAD: + # pixel decoder + PIXEL_DECODER_NAME: "MaskDINOEncoder" + DIM_FEEDFORWARD: 2048 + NUM_FEATURE_LEVELS: 4 + TOTAL_NUM_FEATURE_LEVELS: 4 + IN_FEATURES: ["p3", "p4", "p5", "p6"] + DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["p3", "p4", "p5", "p6"] + COMMON_STRIDE: 4 + TRANSFORMER_ENC_LAYERS: 6 + FEATURE_ORDER: "low2high" +DATASETS: + TRAIN: ("ytvis_2019_train", ) + TEST: ("ytvis_2019_val",) +SOLVER: + IMS_PER_BATCH: 8 + BASE_LR: 0.0001 + STEPS: (6000, ) + MAX_ITER: 8000 + CHECKPOINT_PERIOD: 2000 +INPUT: + IMAGE_SIZE: 1536 + MIN_SCALE: 0.1 + MAX_SCALE: 2.0 + FORMAT: "RGB" + DATASET_MAPPER_NAME: "coco_instance_lsj" +TEST: + EVAL_PERIOD: 100000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: True + NUM_WORKERS: 8 +OUTPUT_DIR: ./GLEE_Pro_ytvis19 \ No newline 
at end of file diff --git a/projects/GLEE/glee/data/ytvis_eval.py b/projects/GLEE/glee/data/ytvis_eval.py index 505f420..c674d80 100755 --- a/projects/GLEE/glee/data/ytvis_eval.py +++ b/projects/GLEE/glee/data/ytvis_eval.py @@ -222,8 +222,9 @@ def instances_to_coco_json_video(inputs, outputs): segms.append(dummy_seg) _boxes.append(None) else: - segms.append(mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0]) - if _mask.sum()>5 and _box is not None: + rle = mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0] + segms.append(rle) + if mask_util.area(rle)>5 and _box is not None: _boxes.append(_box.tolist()) for rle in segms: