diff --git a/data_preparation/README.md b/data_preparation/README.md
index ef20e7c..ee69631 100644
--- a/data_preparation/README.md
+++ b/data_preparation/README.md
@@ -1,4 +1,4 @@
-### Prepare point clouds info files.
+### Prepare MMScan info files.
 
 Given the licenses of respective raw datasets, we recommend users download the raw data from their official websites and then organize them following the below guide.
 Detailed steps are shown as follows.
@@ -9,8 +9,8 @@ Detailed steps are shown as follows.
 
 3. Download Matterport3D data [HERE](https://github.com/niessner/Matterport). Link or move the folder to this level of directory.
 
-4. Organize the file structure. Under `mmscan_data/embodiedscan-split/embodiedscan-v1`, the directory structure should be as below,
-   You are recommanded to create a soft link to the raw data folder under `mmsan_data/embodiedscan-split/embodiedscan-v1`.
+4. Organize the file structure. Under `mmscan_data/embodiedscan-split`, the directory structure should be as below.
+   We recommend creating a soft link to the raw data folder under `mmscan_data/embodiedscan-split`.
 
    ```
    data/
@@ -29,16 +29,15 @@ Detailed steps are shown as follows.
    Additionally, create a `process_pcd` folder in the same directory to store the results.
    Similarly, we recommend using a symbolic link, as the total file size might be a little large (approximately 21GB)
 
    PS: If you have followed the embodiedscan tutorial to organize the data, you can skip these steps and link or copy the `data` folder to
-   `mmsan_data/embodiedscan-split/embodiedscan-v1`.
+   `mmscan_data/embodiedscan-split`.
 
    After all the raw data is organized, the directory structure should be as below:
 
    ```
-   embodiedscan-v1/
+   embodiedscan-split/
    ├── data/
    ├── process_pcd/
-   ├── embodiedscan_infos_train.pkl
-   ├── embodiedscan_infos_val.pkl
+   ├── embodiedscan-v1/
    ```
 
 5. Read raw files and generate processed point cloud files, by running the following scripts.
diff --git a/data_preparation/process_all_scan.py b/data_preparation/process_all_scan.py
index e5b7022..e984426 100644
--- a/data_preparation/process_all_scan.py
+++ b/data_preparation/process_all_scan.py
@@ -14,7 +14,7 @@ from utils.scannet_process import process_scannet
 from utils.trscan_process import process_trscan
 
-dict_1 = {}
+es_anno = {}
 
 
 def create_scene_pcd(es_anno, pcd_result):
@@ -23,23 +23,18 @@ def create_scene_pcd(es_anno, pcd_result):
 
     Args:
         es_anno (dict): The embodiedscan annotation of the target scan.
-        pcd_result (tuple) :
-            (1) aliged point clouds coordinates
-                shape (n,3)
-            (2) point clouds color ([0,1])
-                shape (n,3)
-            (3) label (no need here)
+        pcd_result (tuple) : The raw point cloud data of the scan,
+            consisting of:
+            (1) aligned point cloud coordinates with shape (n,3).
+            (2) point cloud colors ([0,1]) with shape (n,3).
+            (3) labels (no need here).
 
     Returns:
-        tuple :
-            (1) aliged point clouds coordinates
-                shape (n,3)
-            (2) point clouds color ([0,1])
-                shape (n,3)
-            (3) point clouds label (int)
-                shape (n,1)
-            (4) point clouds object id (int)
-                shape (n,1)
+        tuple : The processed point cloud data of the scan, consisting of:
+            (1) aligned point cloud coordinates with shape (n,3).
+            (2) point cloud colors ([0,1]) with shape (n,3).
+            (3) point cloud labels (int) with shape (n,1).
+            (4) point cloud object ids (int) with shape (n,1).
     """
     pc, color, label = pcd_result
     label = np.ones_like(label) * -100
@@ -86,17 +81,21 @@ def process_one_scan(
 ):
     """Process the point clouds of one scan and save in a pth file.
-    The pth file is a tuple of:
-        (1) aliged point clouds coordinates
-            shape (n,3)
-        (2) point clouds color ([0,1])
-            shape (n,3)
-        (3) point clouds label (int)
-            shape (n,1)
-        (4) point clouds object id (int)
-            shape (n,1)
+    The pth file is a tuple of np.ndarray, consisting of:
+        (1) aligned point cloud coordinates with shape (n,3).
+        (2) point cloud colors ranging in [0,1] with shape (n,3).
+        (3) point cloud labels with shape (n,1).
+        (4) point cloud object ids with shape (n,1).
 
     Args:
-        scan_id (str): the scan id
+        scan_id (str): The scan id.
+        save_root (str): The root path to save the pth file.
+        scannet_root (str): The path of scannet.
+        mp3d_root (str): The path of mp3d.
+        trscan_root (str): The path of 3rscan.
+        scannet_matrix (np.ndarray): The axis-align matrix of scannet.
+        mp3d_matrix (np.ndarray): The axis-align matrix of mp3d.
+        trscan_matrix (np.ndarray): The axis-align matrix of 3rscan.
+        mp3d_mapping (dict): The mapping dict for mp3d scan id.
     """
 
     if os.path.exists(f'{save_root}/{scan_id}.pth'):
@@ -104,11 +103,11 @@
     try:
         if 'scene' in scan_id:
-            if 'scannet/' + scan_id not in dict_1:
+            if 'scannet/' + scan_id not in es_anno:
                 return
 
             pcd_info = create_scene_pcd(
-                dict_1['scannet/' + scan_id],
+                es_anno['scannet/' + scan_id],
                 process_scannet(scan_id, scannet_root, scannet_matrix),
             )
@@ -118,19 +117,19 @@
                 'region' + scan_id.split('_region')[1],
             )
             mapping_name = f'matterport3d/{raw_scan_id}/{region_id}'
-            if mapping_name not in dict_1:
+            if mapping_name not in es_anno:
                 return
             pcd_info = create_scene_pcd(
-                dict_1[mapping_name],
+                es_anno[mapping_name],
                 process_mp3d(scan_id, mp3d_root, mp3d_matrix, mp3d_mapping),
             )
 
         else:
-            if '3rscan/' + scan_id not in dict_1:
+            if '3rscan/' + scan_id not in es_anno:
                 return
             pcd_info = create_scene_pcd(
-                dict_1['3rscan/' + scan_id],
+                es_anno['3rscan/' + scan_id],
                 process_trscan(scan_id, trscan_root, trscan_matrix),
             )
 
@@ -182,8 +181,8 @@
     TYPE2INT = np.load(args.train_pkl_path,
                        allow_pickle=True)['metainfo']['categories']
-    dict_1.update(read_annotation_pickle(args.train_pkl_path))
-    dict_1.update(read_annotation_pickle(args.val_pkl_path))
+    es_anno.update(read_annotation_pickle(args.train_pkl_path))
+    es_anno.update(read_annotation_pickle(args.val_pkl_path))
 
     # loading the required scan id
     with open(f'{args.meta_path}/all_scan.json', 'r') as f:
diff --git a/data_preparation/utils/scannet_process.py b/data_preparation/utils/scannet_process.py
index a674fed..b1e666a 100644
--- a/data_preparation/utils/scannet_process.py
+++ b/data_preparation/utils/scannet_process.py
@@ -18,7 +18,7 @@ def process_scannet(scan_id, data_root, scannet_matrix):
     r = np.asarray(data_color.elements[0].data['red'])
     g = np.asarray(data_color.elements[0].data['green'])
     b = np.asarray(data_color.elements[0].data['blue'])
-    pc_color = (np.stack([r, g, b], axis=1) / 256.0).astype(np.float32)
+    pc_color = (np.stack([r, g, b], axis=1) / 255.0).astype(np.float32)
     axis_align_matrix = scannet_matrix[scan_id]
     pts = np.ones((pc.shape[0], 4), dtype=pc.dtype)
     pts[:, :3] = pc
diff --git a/mmscan/evaluator/gpt_evaluation.py b/mmscan/evaluator/gpt_evaluation.py
index 3c58f56..f796f8b 100644
--- a/mmscan/evaluator/gpt_evaluation.py
+++ b/mmscan/evaluator/gpt_evaluation.py
@@ -53,9 +53,9 @@ def normal_query(self,
                 The system prompt inputted into GPT.
             user_content_grounps (list[str]) :
                 The user content inputted into GPT.
-            max_tokens (int) : max tokens, default 1000.
+            max_tokens (int) : Max tokens. Defaults to 1000.
         Returns:
-            dict : the json-format result.
+            dict : The json-format result.
         """
         messages = []
@@ -77,13 +77,11 @@ def qa_evaluation(self, QA_sample_dict, thread_index, tmp_path):
         """Employ the GPT evaluator.
 
         Args:
-            QA_sample_dict (str) :
-                The system prompt inputted into GPT.
-            user_content_grounps (list[str]) :
-                The user content inputted into GPT.
-            max_tokens (int) : max tokens, default 1000.
-        Returns:
-            dict : the json-format result.
+            QA_sample_dict (dict) : The QA sample dict with
+                [gt, pred, question] as values.
+            thread_index (int) : The index of the thread.
+            tmp_path (str) : The path to store the
+                tmp-stored json files.
         """
 
         system_prompt, ex_instance = qa_prompt_define()
@@ -137,7 +135,7 @@ def qa_collection(self, num_threads, tmp_path):
             tmp_path (str) : The path to store the tmp-stored json files.
 
         Returns:
-            dict : the evaluation result.
+            dict : The evaluation result.
         """
 
         eval_dict = {metric: [] for metric in self.qa_metric}
@@ -174,12 +172,12 @@ def load_and_eval(self, raw_batch_input, num_threads=1, tmp_path='./'):
 
         Args:
             raw_batch_input (list[dict]) :
-                the batch of results wanted to evaluate
+                The batch of results to evaluate.
             num_threads (int) : The number of the threadings.
                 Defaults to 1.
             tmp_path (str) : The temporay path to store the json files.
 
         Returns:
-            dict : the evaluation result.
+            dict : The evaluation result.
         """
 
         # (1) Update the results and store in the dict.
@@ -235,7 +233,10 @@ def __check_format__(self, raw_input):
         to be checked, should be a list of dict. Every item with the keys:
         ["ID","question","pred",""gt"] pred is a list with one one element. gt
-        is a list with >=1 elements. "ID" should be unique!!!!
+        is a list with >=1 elements. "ID" should be unique.
+
+        Args:
+            raw_input (list[dict]) : The input to be checked.
         """
         assert isinstance(
             raw_input,
diff --git a/mmscan/evaluator/metrics/box_metric.py b/mmscan/evaluator/metrics/box_metric.py
index b5851c7..818e33b 100644
--- a/mmscan/evaluator/metrics/box_metric.py
+++ b/mmscan/evaluator/metrics/box_metric.py
@@ -13,6 +13,7 @@ def average_precision(recalls, precisions, mode='area'):
         mode (str): 'area' or '11points', 'area' means calculating the
             area under precision-recall curve, '11points' means calculating
             the average precision of recalls at [0, 0.1, ..., 1]
+            Defaults to 'area'.
 
     Returns:
         float or np.ndarray: Calculated average precision.
@@ -57,7 +58,8 @@ def get_f1_scores(iou_matrix, iou_threshold):
 
     Args:
         iou_matrix (ndarray/tensor):
-            the iou matrix of the predictions and ground truths (shape n*m)
+            The iou matrix of the predictions and ground truths with
+            shape (num_preds, num_gts).
         iou_threshold (float): 0.25/0.5
 
     Returns:
@@ -93,7 +95,7 @@ def __get_fp_tp_array__(iou_array, iou_threshold):
     Args:
         iou_array (ndarray/tensor):
             the iou matrix of the predictions and ground truths
-            (shape len(preds)*len(gts))
+            (shape (num_preds, num_gts))
         iou_threshold (float): 0.25/0.5
 
     Returns:
diff --git a/mmscan/evaluator/qa_evaluation.py b/mmscan/evaluator/qa_evaluation.py
index 5761c22..e378b54 100644
--- a/mmscan/evaluator/qa_evaluation.py
+++ b/mmscan/evaluator/qa_evaluation.py
@@ -7,16 +7,16 @@
 
 class QA_Evaluator:
-    """tradition metrics for QA and Caption evaluation , consists the
-    implements of. [EM, BLEU, METEOR, ROUGE, CIDEr, SPICE, SIMCSE, SBERT]
+    """Traditional metrics for QA and Caption evaluation, consisting of the
+    implementations of [EM, BLEU, METEOR, ROUGE, CIDEr, SPICE, SIMCSE, SBERT].
 
-    SIMCSE, SBERT is speacial metrics and needed GPU tools.
+    SIMCSE and SBERT are special metrics and need a GPU.
     Attributes:
-        save_buffer(list[dict]): Save the buffer of Inputs
-        records(list[dict]): Metric results for each sample
-        metric_record(dict): Metric results for each category
+        save_buffer(list[dict]): Save the buffer of Inputs.
+        records(list[dict]): Metric results for each sample.
+        metric_record(dict): Metric results for each category.
             (average of all samples with the same category)
 
     Args:
         model_config(dict): The model config for special metric evaluation.
@@ -67,18 +67,18 @@ def update(self, batch_input):
         """Update a batch of results to the buffer, and then filtering and
         truncating. each item is expected to be a dict with keys.
 
-        ["index", "ID","question","pred",""gt"]
+        ["index", "ID","question","pred","gt"]
 
         1. pred is a list with one one element.
         2. gt is a list with >=1 elements.
-        3. "ID" should be unique!!!!
+        3. "ID" should be unique.
 
         Args:
             batch_input (list[dict]):
-                a batch of the raw original input
+                Batch of the raw original input.
 
         Returns:
             Dict: {"EM":EM metric for this batch,
-                "refined_EM":refined EM metric for this batch}
+                "refined_EM":Refined EM metric for this batch}
         """
         self.__check_format__(batch_input)
@@ -112,7 +112,7 @@ def start_evaluation(self):
         """Start the evaluation process.
 
         Returns:
-            dict: the metrics
+            dict: The results of the evaluation.
         """
 
         # (1) exact match evaluation
@@ -170,18 +170,16 @@ def __check_format__(self, raw_input):
         """Check if the input conform with mmscan evaluation format.
 
-        Args:
-            The input to be checked, should be a list of dict.
-            Every item with the keys:
-            ["index", "ID","question","pred",""gt"]
-            pred is a list with one one element.
-            gt is a list with >=1 elements.
-            "ID" should be unique!!!!
+        Every item with the keys ["index", "ID","question","pred","gt"],
+        'pred' is a list with one element, 'gt' is a list
+        with >=1 elements. "ID" should be unique.
 
+        Args:
+            raw_input (list[dict]): The input to be checked.
         """
         assert isinstance(
             raw_input,
-            list), 'The input of MMScan evaluator should be a list of dict. '
+            list), 'The input of QA evaluator should be a list of dict. '
 
         for _index in range(len(raw_input)):
             if 'index' not in raw_input[_index]:
diff --git a/mmscan/evaluator/vg_evaluation.py b/mmscan/evaluator/vg_evaluation.py
index 90ff78b..03cb3f0 100644
--- a/mmscan/evaluator/vg_evaluation.py
+++ b/mmscan/evaluator/vg_evaluation.py
@@ -10,12 +10,11 @@
 
 class VG_Evaluator:
-    """Evaluator for MMScan Visual Grounding benchmark.
+    """Evaluator for MMScan Visual Grounding benchmark. The evaluation
+    metrics include "AP","AP_C","AR","gTop-k".
 
     Attributes:
-        eval_metric: All the evaluation metric, includes
-            "AP","AP_C","AR","gTop-k"
-        save_buffer(list[dict]): Save the buffer of Inputs
+        save_buffer(list[dict]): Save the buffer of Inputs.
 
         records(list[dict]): Metric results for each sample
 
@@ -27,7 +26,7 @@ class VG_Evaluator:
     """
 
     def __init__(self, verbose=True) -> None:
-        print('new methods!')
+        self.verbose = verbose
 
         self.eval_metric_type = ['AP', 'AR']
         self.top_k_visible = [1, 3, 5]
@@ -55,7 +54,7 @@ def update(self, raw_batch_input):
 
         Args:
             raw_batch_input (list[dict]):
-                a batch of the raw original input
+                Batch of the raw original input.
""" self.__check_format__(raw_batch_input) self.save_buffer.extend(raw_batch_input) @@ -73,8 +72,6 @@ def start_evaluation(self): # (1) len(gt)==0 : skip if self.__is_zero__(data_item['gt_bboxes']): - print('error!!!') - continue # (2) len(pred)==0 : model's wrong @@ -144,9 +141,9 @@ def start_evaluation(self): def collect_result(self): """Collect the result from the evaluation process. - Stores them based on some subclass. + Stores them based on their subclass. Returns: - category_results(dict): Average results per category + category_results(dict): Average results per category. """ category_results = {} category_results['overall'] = {} @@ -186,7 +183,7 @@ def print_result(self): """Showing the result table. Returns: - table(str): the metric result table + table(str): The metric result table. """ assert len(self.category_records) > 0, 'No result yet.' self.category_records = { @@ -246,10 +243,10 @@ def __category_mapping__(self, sub_class): """Mapping the subclass name to the category name. Args: - sub_class (str): the subclass name in the original samples + sub_class (str): The subclass name in the original samples. Returns: - category (str): the category name. + category (str): The category name. """ sub_class = sub_class.lower() sub_class = sub_class.replace('single', 'sngl') @@ -265,10 +262,10 @@ def __calculate_iou_array_(self, data_item): """Calculate some information needed for eavl. Args: - data_item (dict): the subclass name in the original samples + data_item (dict): The subclass name in the original samples. Returns: nd.array, int, int : - the iou array sorted by the confidence, + The iou array sorted by the confidence and the number of predictions, number of gts. """ @@ -297,13 +294,15 @@ def __is_zero__(self, box): return (len(box) == 0) def __check_format__(self, raw_input): - """Check if the input conform with mmscan evaluation format. + """Check if the input conform with mmscan evaluation format. Transform + the input box format. - transform 9 DoF box to ('center'/'size'/'rot_matrix') + Args: + raw_input (list[dict]): The input of VG evaluator. """ assert isinstance( raw_input, - list), 'The input of MMScan evaluator should be a list of dict. ' + list), 'The input of VG evaluator should be a list of dict. ' raw_input = raw_input for _index in tqdm(range(len(raw_input))): diff --git a/mmscan/mmscan.py b/mmscan/mmscan.py index 1554ed1..cc479ab 100644 --- a/mmscan/mmscan.py +++ b/mmscan/mmscan.py @@ -9,7 +9,7 @@ import torch from torch.utils.data import Dataset -from mmscan.utils.box_utils import __9dof_to_6dof__ +from mmscan.utils.box_utils import from_9dof_to_6dof from mmscan.utils.data_io import id_mapping, load_json, read_annotation_pickle from mmscan.utils.task_utils import anno_token_flatten @@ -215,7 +215,7 @@ def __getitem__(self, index_): Input bounding boxes, 9 DoF. Args: - index_ (int): the index + index_ (int): The index. Returns: dict: The sample item corresponding to the index. """ @@ -279,7 +279,7 @@ def get_possess(self, table_name: str, scan_idx: str): """Getting all database about the scan from embodeidscan. Args: - table_name (str): type of the expected data. + table_name (str): The ype of the expected data. scan_idx (str): The scan id to get the data. Returns: The data corresponding to the table_name and scan_idx. @@ -387,9 +387,9 @@ def __process_pcd_info__(self, scan_idx: str): labels and the center of the scan. Args: - scan_idx (str): the scan ID. + scan_idx (str): ID of the scan. Returns: - dict : corresponding scan information. 
+            dict : The corresponding scan information.
         """
 
         assert (scan_idx in self.embodiedscan_anno.keys()
@@ -422,10 +422,9 @@ def __process_box_info__(self, scan_idx: str):
         bounding boxes in format of [ID: {"bbox":bbox, "type":type},...].
 
         Args:
-            scan_idx (str): the scan ID.
+            scan_idx (str): ID of the scan.
         Returns:
-            dict : corresponding bounding boxes
-                information.
+            dict : The corresponding bounding boxes information.
         """
         assert (scan_idx in self.embodiedscan_anno.keys()
                 ), 'Scan {} is not in {} split'.format(scan_idx, self.split)
@@ -447,10 +446,10 @@ def __process_img_info__(self, scan_idx: str):
         extrinsics, image paths(both rgb & depth) and the visible object ids.
 
         Args:
-            scan_idx (str): the scan ID.
+            scan_idx (str): ID of the scan.
         Returns:
-            list[dict] : corresponding information
-                for each camera.
+            list[dict] : The corresponding image information
+                for each camera.
         """
         assert (scan_idx in self.embodiedscan_anno.keys()
                 ), 'Scan {} is not in {} split'.format(scan_idx, self.split)
@@ -491,14 +490,14 @@ def down_9dof_to_6dof(self, pcd, box_9dof) -> np.ndarray:
             The transformed 6DOF bounding box.
         """
 
-        return __9dof_to_6dof__(pcd, box_9dof)
+        return from_9dof_to_6dof(pcd, box_9dof)
 
     def __downsample_annos__(self, annos: List[dict], ratio: float):
         """downsample the annotations with a given ratio.
 
         Args:
-            annos (list[dict]): the original annotations.
-            ratio (float): the ratio to downsample.
+            annos (list[dict]): The original annotations.
+            ratio (float): The ratio to downsample.
         Returns:
             list[dict] : The result.
         """
diff --git a/mmscan/utils/box_utils.py b/mmscan/utils/box_utils.py
index 68afb88..7e3f764 100644
--- a/mmscan/utils/box_utils.py
+++ b/mmscan/utils/box_utils.py
@@ -156,7 +156,7 @@ def normalize_box(scene_pcd, embodied_scan_bbox):
     return bbox
 
 
-def __9dof_to_6dof__(pcd_data, bbox_):
+def from_9dof_to_6dof(pcd_data, bbox_):
     # that's a kind of loss of information, so we don't recommend
     return normalize_box(pcd_data, bbox_)
 
@@ -228,7 +228,7 @@ def euler_iou3d_bbox(center1, size1, rot1, center2, size2, rot2):
         rot1 (Tensor): rot matrix of grounp2.
 
     Returns:
-        numpy.ndarray: (n, m)the 3D IoU
+        numpy.ndarray: (n, m) the 3D IoU.
     """
     if torch.cuda.is_available():
         center1 = center1.cuda()
@@ -250,10 +250,10 @@ def box_num(box):
     """Return the number of boxes in a grounp.
 
     Args:
-        box (list/tuple, tensor): boxes in a grounp.
+        box (list/tuple, tensor): Boxes in a group.
 
     Returns:
-        int : the number
+        int : The number of boxes.
     """
     if isinstance(box, (list, tuple)):
        return box[0].shape[0]
diff --git a/models/EmbodiedScan/configs/grounding/mv-grounding_1xb1_embodiedscan-tiny-vg-9dof.py b/models/EmbodiedScan/configs/grounding/mv-grounding_1xb1_embodiedscan-tiny-vg-9dof.py
deleted file mode 100644
index e402727..0000000
--- a/models/EmbodiedScan/configs/grounding/mv-grounding_1xb1_embodiedscan-tiny-vg-9dof.py
+++ /dev/null
@@ -1,210 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if use ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounder', - num_queries=256, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - backbone=dict( - type='mmdet.ResNet', - depth=50, - base_channels=16, # to make it consistent with mink resnet - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[128, 256, 512, 1024], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'MultiView3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='MultiViewPipeline', - n_images=20, - transforms=[ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadDepthFromFile', backend_args=backend_args), - dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), - dict(type='PointSample', num_points=n_points // 10), - dict(type='Resize', scale=(480, 480), keep_ratio=False) - ]), - dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='MultiViewPipeline', - n_images=50, - ordered=True, - transforms=[ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadDepthFromFile', backend_args=backend_args), - dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), - 
dict(type='PointSample', num_points=n_points // 10), - dict(type='Resize', scale=(480, 480), keep_ratio=False) - ]), - dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) -] - -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=32, - num_workers=4, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict(type='RepeatDataset', - times=1, - dataset=dict(type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file='embodiedscan_train_mini_vg.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth'))) - -val_dataloader = dict(batch_size=32, - num_workers=4, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict(type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file='embodiedscan_val_tiny_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth')) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetric') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/mv-3ddet/mv-grounding.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-100-vg-9dof.py b/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-100-vg-9dof.py deleted file mode 100644 index a94bbcf..0000000 --- a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-100-vg-9dof.py +++ /dev/null @@ -1,233 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. 
-# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounder', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - backbone=dict( - type='mmdet.ResNet', - depth=50, - base_channels=16, # to make it consistent with mink resnet - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[128, 256, 512, 1024], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'MultiView3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='MultiViewPipeline', - n_images=20, - transforms=[ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadDepthFromFile', backend_args=backend_args), - dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), - dict(type='PointSample', num_points=n_points // 10), - dict(type='Resize', scale=(480, 480), keep_ratio=False) - ]), - dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='MultiViewPipeline', - n_images=50, - ordered=True, - transforms=[ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadDepthFromFile', backend_args=backend_args), - dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), - 
dict(type='PointSample', num_points=n_points // 10), - dict(type='Resize', scale=(480, 480), keep_ratio=False) - ]), - dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) -] - -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file='es_gen_text/vg_full/VG_val_flattened_token_positive.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader -# test_dataloader = dict(batch_size=12, -# num_workers=12, -# persistent_workers=True, -# drop_last=False, -# sampler=dict(type='DefaultSampler', shuffle=False), -# dataset=dict(type=dataset_type, -# data_root=data_root, -# ann_file='embodiedscan_infos_test.pkl', -# vg_file='embodiedscan_test_vg.json', -# metainfo=metainfo, -# pipeline=test_pipeline, -# test_mode=True, -# filter_empty_gt=True, -# box_type_3d='Euler-Depth', -# tokens_positive_rebuild=True)) - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth' # noqa -load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/ckpts/3ddet.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof-256query.py b/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof-256query.py deleted file mode 100644 index 46446f4..0000000 --- a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof-256query.py +++ /dev/null @@ -1,233 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points 
= 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. -# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounder', - num_queries=256, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - backbone=dict( - type='mmdet.ResNet', - depth=50, - base_channels=16, # to make it consistent with mink resnet - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[128, 256, 512, 1024], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'MultiView3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='MultiViewPipeline', - n_images=20, - transforms=[ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadDepthFromFile', backend_args=backend_args), - dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), - dict(type='PointSample', num_points=n_points // 10), - dict(type='Resize', scale=(480, 480), keep_ratio=False) - ]), - dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='MultiViewPipeline', - n_images=50, - ordered=True, - 
transforms=[ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadDepthFromFile', backend_args=backend_args), - dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), - dict(type='PointSample', num_points=n_points // 10), - dict(type='Resize', scale=(480, 480), keep_ratio=False) - ]), - dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) -] - -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file= - 'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader -# test_dataloader = dict(batch_size=12, -# num_workers=12, -# persistent_workers=True, -# drop_last=False, -# sampler=dict(type='DefaultSampler', shuffle=False), -# dataset=dict(type=dataset_type, -# data_root=data_root, -# ann_file='embodiedscan_infos_test.pkl', -# vg_file='embodiedscan_test_vg.json', -# metainfo=metainfo, -# pipeline=test_pipeline, -# test_mode=True, -# filter_empty_gt=True, -# box_type_3d='Euler-Depth', -# tokens_positive_rebuild=True)) - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof.py deleted file mode 100644 index 78c2d22..0000000 --- 
a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof.py +++ /dev/null @@ -1,234 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. -# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounder', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - backbone=dict( - type='mmdet.ResNet', - depth=50, - base_channels=16, # to make it consistent with mink resnet - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[128, 256, 512, 1024], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'MultiView3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='MultiViewPipeline', - n_images=20, - transforms=[ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadDepthFromFile', backend_args=backend_args), - dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), - dict(type='PointSample', num_points=n_points // 10), - dict(type='Resize', scale=(480, 480), keep_ratio=False) - ]), - dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['img', 'points', 'gt_bboxes_3d', 
'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='MultiViewPipeline', - n_images=50, - ordered=True, - transforms=[ - dict(type='LoadImageFromFile', backend_args=backend_args), - dict(type='LoadDepthFromFile', backend_args=backend_args), - dict(type='ConvertRGBDToPoints', coord_type='CAMERA'), - dict(type='PointSample', num_points=n_points // 10), - dict(type='Resize', scale=(480, 480), keep_ratio=False) - ]), - dict(type='AggregateMultiViewPoints', coord_type='DEPTH'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d']) -] - -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file= - 'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader -# test_dataloader = dict(batch_size=12, -# num_workers=12, -# persistent_workers=True, -# drop_last=False, -# sampler=dict(type='DefaultSampler', shuffle=False), -# dataset=dict(type=dataset_type, -# data_root=data_root, -# ann_file='embodiedscan_infos_test.pkl', -# vg_file='embodiedscan_test_vg.json', -# metainfo=metainfo, -# pipeline=test_pipeline, -# test_mode=True, -# filter_empty_gt=True, -# box_type_3d='Euler-Depth', -# tokens_positive_rebuild=True)) - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth' # noqa -load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/ckpts/3ddet.pth' # noqa diff --git 
a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-10-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-10-5-vg-9dof.py deleted file mode 100644 index 29736ce..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-10-5-vg-9dof.py +++ /dev/null @@ -1,199 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. -# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounderMod', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - # backbone=dict( - # type='mmdet.ResNet', - # depth=50, - # base_channels=16, # to make it consistent with mink resnet - # num_stages=4, - # out_indices=(0, 1, 2, 3), - # frozen_stages=1, - # norm_cfg=dict(type='BN', requires_grad=False), - # norm_eval=True, - # init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - # style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[64, 128, 256, 512], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'PointCloud3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - 
dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=24, - num_workers=4, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_10Percent_flattened_token_positive.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=24, - num_workers=4, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file= - 'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json', - # vg_file='embodiedscan_val_mini_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof-nocolor.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof-nocolor.py deleted file mode 100644 index 9b2ffc5..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof-nocolor.py +++ /dev/null @@ -1,198 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. 
-# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounderMod', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - # backbone=dict( - # type='mmdet.ResNet', - # depth=50, - # base_channels=16, # to make it consistent with mink resnet - # num_stages=4, - # out_indices=(0, 1, 2, 3), - # frozen_stages=1, - # norm_cfg=dict(type='BN', requires_grad=False), - # norm_eval=True, - # init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - # style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[64, 128, 256, 512], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'PointCloud3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline', keep_rgb=False), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline', keep_rgb=False), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=24, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json', - 
metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=24, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file='es_gen_text/vg_full/VG_val_flattened_token_positive.json', - # vg_file='embodiedscan_val_mini_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof.py deleted file mode 100644 index c09c1b7..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof.py +++ /dev/null @@ -1,198 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. 
-# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounderMod', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - # backbone=dict( - # type='mmdet.ResNet', - # depth=50, - # base_channels=16, # to make it consistent with mink resnet - # num_stages=4, - # out_indices=(0, 1, 2, 3), - # frozen_stages=1, - # norm_cfg=dict(type='BN', requires_grad=False), - # norm_eval=True, - # init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - # style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[64, 128, 256, 512], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'PointCloud3DGroundingDataset' -data_root = '/mnt/petrelfs/linjingli/tmp/code/MMScan-code/VG/benchmark/EmbodiedScan/data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=24, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 
'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=24, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file='es_gen_text/vg_full/VG_val_flattened_token_positive.json', - # vg_file='embodiedscan_val_mini_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-load.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-load.py deleted file mode 100644 index 57c2622..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-load.py +++ /dev/null @@ -1,199 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. 
-# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounderMod', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - # backbone=dict( - # type='mmdet.ResNet', - # depth=50, - # base_channels=16, # to make it consistent with mink resnet - # num_stages=4, - # out_indices=(0, 1, 2, 3), - # frozen_stages=1, - # norm_cfg=dict(type='BN', requires_grad=False), - # norm_eval=True, - # init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - # style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[64, 128, 256, 512], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'PointCloud3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=24, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json', - metainfo=metainfo, - 
pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=24, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file= - 'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json', - # vg_file='embodiedscan_val_mini_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/ckpts/3ddet.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-nocolor.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-nocolor.py deleted file mode 100644 index 64d267b..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-nocolor.py +++ /dev/null @@ -1,199 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. 
-# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounderMod', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - # backbone=dict( - # type='mmdet.ResNet', - # depth=50, - # base_channels=16, # to make it consistent with mink resnet - # num_stages=4, - # out_indices=(0, 1, 2, 3), - # frozen_stages=1, - # norm_cfg=dict(type='BN', requires_grad=False), - # norm_eval=True, - # init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - # style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[64, 128, 256, 512], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'PointCloud3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline', keep_rgb=False), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline', keep_rgb=False), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=24, - num_workers=4, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json', - 
metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=24, - num_workers=4, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file= - 'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json', - # vg_file='embodiedscan_val_mini_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof.py deleted file mode 100644 index 6953544..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof.py +++ /dev/null @@ -1,199 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. 
-# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounderMod', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - # backbone=dict( - # type='mmdet.ResNet', - # depth=50, - # base_channels=16, # to make it consistent with mink resnet - # num_stages=4, - # out_indices=(0, 1, 2, 3), - # frozen_stages=1, - # norm_cfg=dict(type='BN', requires_grad=False), - # norm_eval=True, - # init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - # style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[64, 128, 256, 512], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'PointCloud3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=24, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json', - metainfo=metainfo, - 
pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=24, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file= - 'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json', - # vg_file='embodiedscan_val_mini_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num100.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num100.py new file mode 100644 index 0000000..ffbe6bd --- /dev/null +++ b/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num100.py @@ -0,0 +1,273 @@ + +# edit it +load_from = '/path/to/mv-3ddet.pth' +backend_args = None +custom_hooks = [ + dict(after_iter=True, type='EmptyCacheHook'), +] +data_root = 'data' +dataset_type = 'PointCloud3DGroundingDataset' +default_hooks = dict( + checkpoint=dict(interval=1, max_keep_ckpts=3, type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook')) +default_scope = 'embodiedscan' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl', port=22873), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +find_unused_parameters = True +launcher = 'slurm' + +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +lr = 0.0005 +metainfo = dict(classes='all') +model = dict( + backbone_3d=dict(depth=34, in_channels=6, type='MinkResNet'), + bbox_head=dict( + contrastive_cfg=dict(bias=True, log_scale='auto', max_text_len=256), + decouple_bbox_loss=True, + decouple_groups=4, + decouple_weights=[ + 0.2, + 0.2, + 0.2, + 0.4, + ], + loss_bbox=dict( + group='g8', loss_weight=1.0, mode='l1', type='BBoxCDLoss'), + loss_cls=dict( + alpha=0.25, + gamma=2.0, + loss_weight=1.0, + type='mmdet.FocalLoss', + use_sigmoid=True), + num_classes=256, + 
share_pred_layer=True, + sync_cls_avg_factor=True, + type='GroundingHead'), + coord_type='DEPTH', + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_size_divisor=32, + std=[ + 58.395, + 57.12, + 57.375, + ], + type='Det3DDataPreprocessor'), + decoder=dict( + layer_cfg=dict( + cross_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8), + cross_attn_text_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0), + self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8)), + num_layers=6, + post_norm_cfg=None, + return_intermediate=True), + neck_3d=dict( + in_channels=[ + 64, + 128, + 256, + 512, + ], + num_classes=1, + out_channels=256, + pts_prune_threshold=1000, + type='MinkNeck', + voxel_size=0.01), + num_queries=100, + test_cfg=None, + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='BinaryFocalLossCost', weight=1.0), + dict(type='BBox3DL1Cost', weight=2.0), + dict(type='IoU3DCost', weight=2.0), + ], + type='HungarianAssigner3D')), + type='SparseFeatureFusion3DGrounderMod', + use_xyz_feat=True, + voxel_size=0.01) +n_points = 100000 +optim_wrapper = dict( + clip_grad=dict(max_norm=10, norm_type=2), + optimizer=dict(lr=0.0005, type='AdamW', weight_decay=0.0005), + paramwise_cfg=dict( + custom_keys=dict( + decoder=dict(decay_mult=1.0, lr_mult=0.1), + text_encoder=dict(lr_mult=0.0))), + type='OptimWrapper') +param_scheduler = dict( + begin=0, + by_epoch=True, + end=12, + gamma=0.1, + milestones=[ + 8, + 11, + ], + type='MultiStepLR') +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=24, + dataset=dict( + ann_file='embodiedscan_infos_val.pkl', + box_type_3d='Euler-Depth', + data_root='data', + filter_empty_gt=True, + metainfo=dict(classes='all'), + pipeline=[ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), + ], + test_mode=True, + tokens_positive_rebuild=True, + type='MMScanPointCloud3DGroundingDataset', + vg_file= + ''), + drop_last=False, + num_workers=12, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict(type='GroundingMetricMod') +test_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), +] +train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=3) +train_dataloader = dict( + batch_size=24, + dataset=dict( + dataset=dict( + ann_file='embodiedscan_infos_train.pkl', + box_type_3d='Euler-Depth', + data_root='data', + filter_empty_gt=True, + metainfo=dict(classes='all'), + pipeline=[ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + rot_range=[ + -0.087266, + 0.087266, + ], + scale_ratio_range=[ + 0.9, + 1.1, + ], + shift_height=False, + translation_std=[ + 0.1, + 0.1, + 0.1, + ], + type='GlobalRotScaleTrans'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), + ], + test_mode=False, + tokens_positive_rebuild=True, + type='MMScanPointCloud3DGroundingDataset', + vg_file= + '' + ), + times=1, + type='RepeatDataset'), + num_workers=12, + persistent_workers=True, + sampler=dict(shuffle=True, 
type='DefaultSampler')) +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + rot_range=[ + -0.087266, + 0.087266, + ], + scale_ratio_range=[ + 0.9, + 1.1, + ], + shift_height=False, + translation_std=[ + 0.1, + 0.1, + 0.1, + ], + type='GlobalRotScaleTrans'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=24, + dataset=dict( + ann_file='embodiedscan_infos_val.pkl', + box_type_3d='Euler-Depth', + data_root='data', + filter_empty_gt=True, + metainfo=dict(classes='all'), + pipeline=[ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), + ], + test_mode=True, + tokens_positive_rebuild=True, + type='MMScanPointCloud3DGroundingDataset', + vg_file= + ''), + drop_last=False, + num_workers=12, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict(type='GroundingMetricMod') +work_dir = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-mmscan-grounding-20Per-100queries-load' diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num256.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num256.py new file mode 100644 index 0000000..ebcb458 --- /dev/null +++ b/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num256.py @@ -0,0 +1,273 @@ + +# edit it +load_from = '/path/to/mv-3ddet.pth' +backend_args = None +custom_hooks = [ + dict(after_iter=True, type='EmptyCacheHook'), +] +data_root = 'data' +dataset_type = 'PointCloud3DGroundingDataset' +default_hooks = dict( + checkpoint=dict(interval=1, max_keep_ckpts=3, type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook')) +default_scope = 'embodiedscan' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl', port=22873), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +find_unused_parameters = True +launcher = 'slurm' + +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +lr = 0.0005 +metainfo = dict(classes='all') +model = dict( + backbone_3d=dict(depth=34, in_channels=6, type='MinkResNet'), + bbox_head=dict( + contrastive_cfg=dict(bias=True, log_scale='auto', max_text_len=256), + decouple_bbox_loss=True, + decouple_groups=4, + decouple_weights=[ + 0.2, + 0.2, + 0.2, + 0.4, + ], + loss_bbox=dict( + group='g8', loss_weight=1.0, mode='l1', type='BBoxCDLoss'), + loss_cls=dict( + alpha=0.25, + gamma=2.0, + loss_weight=1.0, + type='mmdet.FocalLoss', + use_sigmoid=True), + num_classes=256, + share_pred_layer=True, + sync_cls_avg_factor=True, + type='GroundingHead'), + coord_type='DEPTH', + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_size_divisor=32, + std=[ + 58.395, + 57.12, + 57.375, + ], + type='Det3DDataPreprocessor'), + decoder=dict( + layer_cfg=dict( + cross_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8), + cross_attn_text_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0), + self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8)), + num_layers=6, + 
post_norm_cfg=None, + return_intermediate=True), + neck_3d=dict( + in_channels=[ + 64, + 128, + 256, + 512, + ], + num_classes=1, + out_channels=256, + pts_prune_threshold=1000, + type='MinkNeck', + voxel_size=0.01), + num_queries=256, + test_cfg=None, + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='BinaryFocalLossCost', weight=1.0), + dict(type='BBox3DL1Cost', weight=2.0), + dict(type='IoU3DCost', weight=2.0), + ], + type='HungarianAssigner3D')), + type='SparseFeatureFusion3DGrounderMod', + use_xyz_feat=True, + voxel_size=0.01) +n_points = 100000 +optim_wrapper = dict( + clip_grad=dict(max_norm=10, norm_type=2), + optimizer=dict(lr=0.0005, type='AdamW', weight_decay=0.0005), + paramwise_cfg=dict( + custom_keys=dict( + decoder=dict(decay_mult=1.0, lr_mult=0.1), + text_encoder=dict(lr_mult=0.0))), + type='OptimWrapper') +param_scheduler = dict( + begin=0, + by_epoch=True, + end=12, + gamma=0.1, + milestones=[ + 8, + 11, + ], + type='MultiStepLR') +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=24, + dataset=dict( + ann_file='embodiedscan_infos_val.pkl', + box_type_3d='Euler-Depth', + data_root='data', + filter_empty_gt=True, + metainfo=dict(classes='all'), + pipeline=[ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), + ], + test_mode=True, + tokens_positive_rebuild=True, + type='MMScanPointCloud3DGroundingDataset', + vg_file= + ''), + drop_last=False, + num_workers=12, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict(type='GroundingMetricMod') +test_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), +] +train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=3) +train_dataloader = dict( + batch_size=24, + dataset=dict( + dataset=dict( + ann_file='embodiedscan_infos_train.pkl', + box_type_3d='Euler-Depth', + data_root='data', + filter_empty_gt=True, + metainfo=dict(classes='all'), + pipeline=[ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + rot_range=[ + -0.087266, + 0.087266, + ], + scale_ratio_range=[ + 0.9, + 1.1, + ], + shift_height=False, + translation_std=[ + 0.1, + 0.1, + 0.1, + ], + type='GlobalRotScaleTrans'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), + ], + test_mode=False, + tokens_positive_rebuild=True, + type='MMScanPointCloud3DGroundingDataset', + vg_file= + '' + ), + times=1, + type='RepeatDataset'), + num_workers=12, + persistent_workers=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + rot_range=[ + -0.087266, + 0.087266, + ], + scale_ratio_range=[ + 0.9, + 1.1, + ], + shift_height=False, + translation_std=[ + 0.1, + 0.1, + 0.1, + ], + type='GlobalRotScaleTrans'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=24, + dataset=dict( + ann_file='embodiedscan_infos_val.pkl', + box_type_3d='Euler-Depth', + data_root='data', + 
filter_empty_gt=True, + metainfo=dict(classes='all'), + pipeline=[ + dict(type='LoadAnnotations3D'), + dict(type='DefaultPipeline'), + dict(num_points=100000, type='PointSample'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), + ], + test_mode=True, + tokens_positive_rebuild=True, + type='MMScanPointCloud3DGroundingDataset', + vg_file= + ''), + drop_last=False, + num_workers=12, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict(type='GroundingMetricMod') +work_dir = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-mmscan-grounding-20Per-100queries-load' diff --git a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-100-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-100-5-vg-9dof.py deleted file mode 100644 index 1b147a5..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-100-5-vg-9dof.py +++ /dev/null @@ -1,199 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. -# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounderMod', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - # backbone=dict( - # type='mmdet.ResNet', - # depth=50, - # base_channels=16, # to make it consistent with mink resnet - # num_stages=4, - # out_indices=(0, 1, 2, 3), - # frozen_stages=1, - # norm_cfg=dict(type='BN', requires_grad=False), - # norm_eval=True, - # init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - # style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[64, 128, 256, 512], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - 
dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'PointCloud3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_flattened_token_positive.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file= - 'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json', - # vg_file='embodiedscan_val_mini_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-50-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-50-5-vg-9dof.py deleted file mode 100644 index efabefc..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-50-5-vg-9dof.py +++ /dev/null @@ -1,199 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use 
ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. -# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounderMod', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - # backbone=dict( - # type='mmdet.ResNet', - # depth=50, - # base_channels=16, # to make it consistent with mink resnet - # num_stages=4, - # out_indices=(0, 1, 2, 3), - # frozen_stages=1, - # norm_cfg=dict(type='BN', requires_grad=False), - # norm_eval=True, - # init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - # style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[64, 128, 256, 512], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'PointCloud3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - 
vg_file= - 'es_gen_text/vg_full/VG_train_50Percent_flattened_token_positive.json', - metainfo=metainfo, - pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file= - 'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json', - # vg_file='embodiedscan_val_mini_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-75-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-75-5-vg-9dof.py deleted file mode 100644 index 703148f..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-75-5-vg-9dof.py +++ /dev/null @@ -1,199 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 - -backend_args = None -# Uncomment the following if use ceph or other file clients. -# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient -# for more details. 
-# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/', -# 'data/scannet/': -# 's3://openmmlab/datasets/detection3d/scannet_processed/' -# })) - -metainfo = dict(classes='all') - -model = dict( - type='SparseFeatureFusion3DGrounderMod', - num_queries=100, - voxel_size=0.01, - data_preprocessor=dict(type='Det3DDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_size_divisor=32), - # backbone=dict( - # type='mmdet.ResNet', - # depth=50, - # base_channels=16, # to make it consistent with mink resnet - # num_stages=4, - # out_indices=(0, 1, 2, 3), - # frozen_stages=1, - # norm_cfg=dict(type='BN', requires_grad=False), - # norm_eval=True, - # init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), - # style='pytorch'), - backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34), - use_xyz_feat=True, - # change due to no img feature fusion - neck_3d=dict(type='MinkNeck', - num_classes=1, - in_channels=[64, 128, 256, 512], - out_channels=256, - voxel_size=0.01, - pts_prune_threshold=1000), - decoder=dict( - num_layers=6, - return_intermediate=True, - layer_cfg=dict( - # query self attention layer - self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to text - cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - # cross attention layer query to image - cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), - ffn_cfg=dict(embed_dims=256, - feedforward_channels=2048, - ffn_drop=0.0)), - post_norm_cfg=None), - bbox_head=dict(type='GroundingHead', - num_classes=256, - sync_cls_avg_factor=True, - decouple_bbox_loss=True, - decouple_groups=4, - share_pred_layer=True, - decouple_weights=[0.2, 0.2, 0.2, 0.4], - contrastive_cfg=dict(max_text_len=256, - log_scale='auto', - bias=True), - loss_cls=dict(type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='BBoxCDLoss', - mode='l1', - loss_weight=1.0, - group='g8')), - coord_type='DEPTH', - # training and testing settings - train_cfg=dict(assigner=dict(type='HungarianAssigner3D', - match_costs=[ - dict(type='BinaryFocalLossCost', - weight=1.0), - dict(type='BBox3DL1Cost', weight=2.0), - dict(type='IoU3DCost', weight=2.0) - ]), ), - test_cfg=None) - -dataset_type = 'PointCloud3DGroundingDataset' -data_root = 'data' - -train_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='GlobalRotScaleTrans', - rot_range=[-0.087266, 0.087266], - scale_ratio_range=[.9, 1.1], - translation_std=[.1, .1, .1], - shift_height=False), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict(type='LoadAnnotations3D'), - dict(type='PointCloudPipeline'), - dict(type='PointSample', num_points=n_points), - dict(type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -# TODO: to determine a reasonable batch size -train_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='RepeatDataset', - times=1, - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_train.pkl', - vg_file= - 'es_gen_text/vg_full/VG_train_75Percent_flattened_token_positive.json', - metainfo=metainfo, - 
pipeline=train_pipeline, - test_mode=False, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True))) - -val_dataloader = dict( - batch_size=12, - num_workers=12, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='embodiedscan_infos_val.pkl', - vg_file= - 'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json', - # vg_file='embodiedscan_val_mini_vg.json', - metainfo=metainfo, - pipeline=test_pipeline, - test_mode=True, - filter_empty_gt=True, - box_type_3d='Euler-Depth', - tokens_positive_rebuild=True)) -test_dataloader = val_dataloader - -val_evaluator = dict(type='GroundingMetricMod') -test_evaluator = val_evaluator - -# training schedule for 1x -train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') - -# optimizer -lr = 5e-4 -optim_wrapper = dict(type='OptimWrapper', - optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'text_encoder': dict(lr_mult=0.0), - 'decoder': dict(lr_mult=0.1, decay_mult=1.0) - }), - clip_grad=dict(max_norm=10, norm_type=2)) - -# learning rate -param_scheduler = dict(type='MultiStepLR', - begin=0, - end=12, - by_epoch=True, - milestones=[8, 11], - gamma=0.1) - -custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)] - -# hooks -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3)) - -# vis_backends = [ -# dict(type='TensorboardVisBackend'), -# dict(type='LocalVisBackend') -# ] -# visualizer = dict( -# type='Det3DLocalVisualizer', -# vis_backends=vis_backends, name='visualizer') - -find_unused_parameters = True -# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth' # noqa diff --git a/models/EmbodiedScan/configs/grounding/pcd_vg_mmscan.py b/models/EmbodiedScan/configs/grounding/pcd_vg_mmscan.py deleted file mode 100644 index 81f328d..0000000 --- a/models/EmbodiedScan/configs/grounding/pcd_vg_mmscan.py +++ /dev/null @@ -1,256 +0,0 @@ -_base_ = ['../default_runtime.py'] -n_points = 100000 -backend_args = None -custom_hooks = [ - dict(after_iter=True, type='EmptyCacheHook'), -] -data_root = 'data' -dataset_type = 'MMScanPointCloud3DGroundingDataset' -default_hooks = dict(checkpoint=dict(interval=1, - max_keep_ckpts=3, - type='CheckpointHook'), - logger=dict(interval=50, type='LoggerHook'), - param_scheduler=dict(type='ParamSchedulerHook'), - sampler_seed=dict(type='DistSamplerSeedHook'), - timer=dict(type='IterTimerHook')) -default_scope = 'embodiedscan' -# env_cfg = dict( -# cudnn_benchmark=False, -# dist_cfg=dict(backend='nccl', port=25940), -# mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) -find_unused_parameters = True - -load_from = None -log_level = 'INFO' -log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) -lr = 0.0005 -metainfo = dict(classes='all') -model = dict(backbone_3d=dict(depth=34, in_channels=6, type='MinkResNet'), - bbox_head=dict(contrastive_cfg=dict(bias=True, - log_scale='auto', - max_text_len=256), - decouple_bbox_loss=True, - decouple_groups=4, - decouple_weights=[ - 0.2, - 0.2, - 0.2, - 0.4, - ], - loss_bbox=dict(group='g8', - loss_weight=1.0, - mode='l1', - type='BBoxCDLoss'), - loss_cls=dict(alpha=0.25, - gamma=2.0, - loss_weight=1.0, - type='mmdet.FocalLoss', - use_sigmoid=True), - num_classes=256, - share_pred_layer=True, - 
diff --git a/models/EmbodiedScan/configs/grounding/pcd_vg_mmscan.py b/models/EmbodiedScan/configs/grounding/pcd_vg_mmscan.py
deleted file mode 100644
index 81f328d..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_vg_mmscan.py
+++ /dev/null
@@ -1,256 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-backend_args = None
-custom_hooks = [
-    dict(after_iter=True, type='EmptyCacheHook'),
-]
-data_root = 'data'
-dataset_type = 'MMScanPointCloud3DGroundingDataset'
-default_hooks = dict(checkpoint=dict(interval=1,
-                                     max_keep_ckpts=3,
-                                     type='CheckpointHook'),
-                     logger=dict(interval=50, type='LoggerHook'),
-                     param_scheduler=dict(type='ParamSchedulerHook'),
-                     sampler_seed=dict(type='DistSamplerSeedHook'),
-                     timer=dict(type='IterTimerHook'))
-default_scope = 'embodiedscan'
-# env_cfg = dict(
-#     cudnn_benchmark=False,
-#     dist_cfg=dict(backend='nccl', port=25940),
-#     mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
-find_unused_parameters = True
-
-load_from = None
-log_level = 'INFO'
-log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50)
-lr = 0.0005
-metainfo = dict(classes='all')
-model = dict(backbone_3d=dict(depth=34, in_channels=6, type='MinkResNet'),
-             bbox_head=dict(contrastive_cfg=dict(bias=True,
-                                                 log_scale='auto',
-                                                 max_text_len=256),
-                            decouple_bbox_loss=True,
-                            decouple_groups=4,
-                            decouple_weights=[
-                                0.2,
-                                0.2,
-                                0.2,
-                                0.4,
-                            ],
-                            loss_bbox=dict(group='g8',
-                                           loss_weight=1.0,
-                                           mode='l1',
-                                           type='BBoxCDLoss'),
-                            loss_cls=dict(alpha=0.25,
-                                          gamma=2.0,
-                                          loss_weight=1.0,
-                                          type='mmdet.FocalLoss',
-                                          use_sigmoid=True),
-                            num_classes=256,
-                            share_pred_layer=True,
-                            sync_cls_avg_factor=True,
-                            type='GroundingHead'),
-             coord_type='DEPTH',
-             data_preprocessor=dict(bgr_to_rgb=True,
-                                    mean=[
-                                        123.675,
-                                        116.28,
-                                        103.53,
-                                    ],
-                                    pad_size_divisor=32,
-                                    std=[
-                                        58.395,
-                                        57.12,
-                                        57.375,
-                                    ],
-                                    type='Det3DDataPreprocessor'),
-             decoder=dict(layer_cfg=dict(
-                 cross_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8),
-                 cross_attn_text_cfg=dict(dropout=0.0,
-                                          embed_dims=256,
-                                          num_heads=8),
-                 ffn_cfg=dict(embed_dims=256,
-                              feedforward_channels=2048,
-                              ffn_drop=0.0),
-                 self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8)),
-                          num_layers=6,
-                          post_norm_cfg=None,
-                          return_intermediate=True),
-             neck_3d=dict(in_channels=[
-                 64,
-                 128,
-                 256,
-                 512,
-             ],
-                          num_classes=1,
-                          out_channels=256,
-                          pts_prune_threshold=1000,
-                          type='MinkNeck',
-                          voxel_size=0.01),
-             num_queries=100,
-             test_cfg=None,
-             train_cfg=dict(assigner=dict(match_costs=[
-                 dict(type='BinaryFocalLossCost', weight=1.0),
-                 dict(type='BBox3DL1Cost', weight=2.0),
-                 dict(type='IoU3DCost', weight=2.0),
-             ],
-                                          type='HungarianAssigner3D')),
-             type='SparseFeatureFusion3DGrounderMod',
-             use_xyz_feat=True,
-             voxel_size=0.01)
-n_points = 100000
-optim_wrapper = dict(
-    clip_grad=dict(max_norm=10, norm_type=2),
-    optimizer=dict(lr=0.0005, type='AdamW', weight_decay=0.0005),
-    paramwise_cfg=dict(
-        custom_keys=dict(decoder=dict(decay_mult=1.0, lr_mult=0.1),
-                         text_encoder=dict(lr_mult=0.0))),
-    type='OptimWrapper')
-param_scheduler = dict(begin=0,
-                       by_epoch=True,
-                       end=12,
-                       gamma=0.1,
-                       milestones=[
-                           8,
-                           11,
-                       ],
-                       type='MultiStepLR')
-resume = False
-test_cfg = dict(type='TestLoop')
-test_dataloader = dict(batch_size=24,
-                       dataset=dict(ann_file='',
-                                    box_type_3d='Euler-Depth',
-                                    data_root='data',
-                                    filter_empty_gt=True,
-                                    metainfo=dict(classes='all'),
-                                    pipeline=[
-                                        dict(type='LoadAnnotations3D'),
-                                        dict(type='DefaultPipeline'),
-                                        dict(num_points=100000,
-                                             type='PointSample'),
-                                        dict(keys=[
-                                            'points',
-                                            'gt_bboxes_3d',
-                                            'gt_labels_3d',
-                                        ],
-                                             type='Pack3DDetInputs'),
-                                    ],
-                                    test_mode=True,
-                                    tokens_positive_rebuild=True,
-                                    type='MMScanPointCloud3DGroundingDataset',
-                                    vg_file=''),
-                       drop_last=False,
-                       num_workers=4,
-                       persistent_workers=True,
-                       sampler=dict(shuffle=False, type='DefaultSampler'))
-test_evaluator = dict(type='GroundingMetricMod')
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='DefaultPipeline'),
-    dict(num_points=100000, type='PointSample'),
-    dict(keys=[
-        'points',
-        'gt_bboxes_3d',
-        'gt_labels_3d',
-    ],
-         type='Pack3DDetInputs'),
-]
-
-train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=12)
-train_dataloader = dict(batch_size=24,
-                        dataset=dict(dataset=dict(
-                            ann_file='',
-                            box_type_3d='Euler-Depth',
-                            data_root='data',
-                            filter_empty_gt=True,
-                            metainfo=dict(classes='all'),
-                            pipeline=[
-                                dict(type='LoadAnnotations3D'),
-                                dict(type='DefaultPipeline'),
-                                dict(num_points=100000, type='PointSample'),
-                                dict(rot_range=[
-                                    -0.087266,
-                                    0.087266,
-                                ],
-                                     scale_ratio_range=[
-                                         0.9,
-                                         1.1,
-                                     ],
-                                     shift_height=False,
-                                     translation_std=[
-                                         0.1,
-                                         0.1,
-                                         0.1,
-                                     ],
-                                     type='GlobalRotScaleTrans'),
-                                dict(keys=[
-                                    'points',
-                                    'gt_bboxes_3d',
-                                    'gt_labels_3d',
-                                ],
-                                     type='Pack3DDetInputs'),
-                            ],
-                            test_mode=False,
-                            tokens_positive_rebuild=True,
-                            type='MMScanPointCloud3DGroundingDataset',
-                            vg_file=''),
-                                     times=1,
-                                     type='RepeatDataset'),
-                        num_workers=4,
-                        persistent_workers=True,
-                        sampler=dict(shuffle=True, type='DefaultSampler'))
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='DefaultPipeline'),
-    dict(num_points=100000, type='PointSample'),
-    dict(rot_range=[
-        -0.087266,
-        0.087266,
-    ],
-         scale_ratio_range=[
-             0.9,
-             1.1,
-         ],
-         shift_height=False,
-         translation_std=[
-             0.1,
-             0.1,
-             0.1,
-         ],
-         type='GlobalRotScaleTrans'),
-    dict(keys=[
-        'points',
-        'gt_bboxes_3d',
-        'gt_labels_3d',
-    ],
-         type='Pack3DDetInputs'),
-]
-val_cfg = dict(type='ValLoop')
-val_dataloader = dict(batch_size=24,
-                      dataset=dict(ann_file='',
-                                   box_type_3d='Euler-Depth',
-                                   data_root='data',
-                                   filter_empty_gt=True,
-                                   metainfo=dict(classes='all'),
-                                   pipeline=[
-                                       dict(type='LoadAnnotations3D'),
-                                       dict(type='DefaultPipeline'),
-                                       dict(num_points=100000,
-                                            type='PointSample'),
-                                       dict(keys=[
-                                           'points',
-                                           'gt_bboxes_3d',
-                                           'gt_labels_3d',
-                                       ],
-                                            type='Pack3DDetInputs'),
-                                   ],
-                                   test_mode=True,
-                                   tokens_positive_rebuild=True,
-                                   type='MMScanPointCloud3DGroundingDataset',
-                                   vg_file=' '),
-                      drop_last=False,
-                      num_workers=4,
-                      persistent_workers=True,
-                      sampler=dict(shuffle=False, type='DefaultSampler'))
-val_evaluator = dict(type='GroundingMetricMod')
-work_dir = 'exps/MMScan-VG-1030'
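Both this deleted config and its replacement sample `n_points = 100000` points per scan and augment training data with `GlobalRotScaleTrans`: yaw jitter in [-0.087266, 0.087266] rad (roughly ±5°), scaling in [0.9, 1.1], and Gaussian translation noise with std 0.1. The following is an illustrative numpy re-implementation of that augmentation under the assumption that the rotation is about the z axis; it is a sketch, not the embodiedscan transform itself.

```python
# Rough sketch of a GlobalRotScaleTrans-style augmentation (assumption:
# rotation about z), using the ranges from the config above.
import numpy as np

def global_rot_scale_trans(points: np.ndarray,
                           rng: np.random.Generator) -> np.ndarray:
    angle = rng.uniform(-0.087266, 0.087266)   # rot_range, ~ +/-5 degrees
    scale = rng.uniform(0.9, 1.1)              # scale_ratio_range
    trans = rng.normal(0.0, 0.1, size=3)       # translation_std
    c, s = np.cos(angle), np.sin(angle)
    rot_z = np.array([[c, -s, 0.0],
                      [s, c, 0.0],
                      [0.0, 0.0, 1.0]])
    return points @ rot_z.T * scale + trans

points = np.random.rand(100000, 3).astype(np.float32)  # n_points = 100000
print(global_rot_scale_trans(points, np.random.default_rng(0)).shape)
```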
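The `mmscan_dataset.py` hunk that follows doubles the fraction of MMScan-VG samples the dataset draws, from 10% to 20%. Based on the constructor call visible in that hunk, loading the same subset directly through the devkit would look roughly like the sketch below; the root-level `MMScan` import and index access are assumptions, not documented API.

```python
# Sketch based on the constructor call in the hunk below; the exact
# import path and sample access pattern are assumptions.
from mmscan import MMScan

# Draw 20% of the MMScan-VG training samples, matching ratio=0.2.
loader = MMScan(version='v1', split='train', task='MMScan-VG', ratio=0.2)
print(len(loader))  # number of sampled visual-grounding items (assumed)
sample = loader[0]  # one grounding sample: scan + language annotation (assumed)
```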
diff --git a/models/EmbodiedScan/embodiedscan/datasets/mmscan_dataset.py b/models/EmbodiedScan/embodiedscan/datasets/mmscan_dataset.py
index fc9a438..eee72a2 100644
--- a/models/EmbodiedScan/embodiedscan/datasets/mmscan_dataset.py
+++ b/models/EmbodiedScan/embodiedscan/datasets/mmscan_dataset.py
@@ -149,7 +149,7 @@ def __init__(self,
         self.mmscan_loader = MMScan(version='v1',
                                     split='val' if test_mode else 'train',
                                     task='MMScan-VG',
-                                    ratio=0.1)
+                                    ratio=0.2)
 
         if 'classes' in metainfo:
             if metainfo['classes'] == 'all':
diff --git a/models/README.md b/models/README.md
index 109231b..5309e7b 100644
--- a/models/README.md
+++ b/models/README.md
@@ -1,4 +1,6 @@
-## Visual Grounding Models
+## 3D Visual Grounding Models
+
+These are 3D visual grounding models adapted for the mmscan-devkit. Currently, two models have been released: EmbodiedScan and ScanRefer.
 
 ### Scanrefer
 
@@ -22,31 +24,33 @@
 
 ### EmbodiedScan
 
-1. Follow the [EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan/blob/main/README.md) to setup the Env. You need not load the datasets!
+1. Follow the [EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan/blob/main/README.md) guide to set up the environment. Download the [Multi-View 3D Detection model's weights](https://download.openmmlab.com/mim-example/embodiedscan/mv-3ddet.pth) and change the `load_from` path in the config file under `configs/grounding` to the path where the weights are saved.
 
 2. Install MMScan API.
 
-3. Run the following command to train Scanrefer (multiple GPU):
+3. Run the following command to train EmbodiedScan (single or multiple GPU):
 
    ```bash
    # Single GPU training
-   python tools/train.py configs/grounding/pcd_vg_mmscan.py --work-dir=path/to/save
+   python tools/train.py configs/grounding/pcd_4xb24_mmscan_vg_num256.py --work-dir=path/to/save
 
    # Multiple GPU training
-   python tools/train.py configs/grounding/pcd_vg_mmscan.py --work-dir=path/to/save --launcher="pytorch"
+   python tools/train.py configs/grounding/pcd_4xb24_mmscan_vg_num256.py --work-dir=path/to/save --launcher="pytorch"
   ```
 
-4. Run the following command to evaluate Scanrefer (multiple GPU):
+4. Run the following command to evaluate EmbodiedScan (single or multiple GPU):
 
   ```bash
   # Single GPU testing
-   python tools/test.py configs/grounding/pcd_vg_mmscan.py path/to/load_pth
+   python tools/test.py configs/grounding/pcd_4xb24_mmscan_vg_num256.py path/to/load_pth
 
   # Multiple GPU testing
-   python tools/test.py configs/grounding/pcd_vg_mmscan.py path/to/load_pth --launcher="pytorch"
+   python tools/test.py configs/grounding/pcd_4xb24_mmscan_vg_num256.py path/to/load_pth --launcher="pytorch"
   ```
 
-## Question Answering Models
+## 3D Question Answering Models
+
+These are 3D question answering models adapted for the mmscan-devkit. Currently, two models have been released: LL3DA and LEO.
 
 ### LL3DA
 
@@ -79,7 +83,6 @@
    python eval_utils/evaluate_gpt.py --file path/to/qa_pred_gt_val.json
    --tmp_path path/to/tmp --api_key your_api_key --eval_size -1
    --nproc 4
-
   ```
 
 ### LEO
 
@@ -113,7 +116,6 @@
    python evaluator/GPT_eval.py --file path/to/test_embodied_scan_l_complete.json
    --tmp_path path/to/tmp --api_key your_api_key --eval_size -1
    --nproc 4
-
   ```
 
 PS: When trained for more epochs, LEO may encounter a "NaN" error in the MultiHeadAttentionSpatial module due to the training setup (no problem for 4 GPUs, one epoch).
diff --git a/models/Scanrefer/scripts/train.py b/models/Scanrefer/scripts/train.py
index 6dde77f..0443bf4 100644
--- a/models/Scanrefer/scripts/train.py
+++ b/models/Scanrefer/scripts/train.py
@@ -240,7 +240,7 @@ def train(args):
     # }
 
     # dataloader
-    train_dataset, train_dataloader = get_dataloader(args, 'train', DC, True)
+    train_dataset, train_dataloader = get_dataloader(args, 'train', DC, False)
     val_dataset, val_dataloader = get_dataloader(args, 'val', DC, False)
 
     dataloader = {'train': train_dataloader, 'val': val_dataloader}
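The last hunk flips the final argument of `get_dataloader` for the train split from `True` to `False`. In ScanRefer-style training code that flag is typically forwarded to `torch.utils.data.DataLoader(shuffle=...)`; the sketch below illustrates that reading, with all names hypothetical rather than taken from the ScanRefer source.

```python
# Hypothetical sketch of the flag toggled above, assuming it maps to
# DataLoader(shuffle=...). Names here are illustrative, not ScanRefer's.
from torch.utils.data import DataLoader, Dataset

class DummyScanReferDataset(Dataset):
    def __len__(self):
        return 8
    def __getitem__(self, idx):
        return idx

def get_dataloader(dataset: Dataset, shuffle: bool) -> DataLoader:
    # shuffle=False keeps sample order deterministic across epochs,
    # which the diff above now uses for the train split as well.
    return DataLoader(dataset, batch_size=4, shuffle=shuffle)

train_loader = get_dataloader(DummyScanReferDataset(), shuffle=False)
print([batch.tolist() for batch in train_loader])  # [[0, 1, 2, 3], [4, 5, 6, 7]]
```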