diff --git a/data_preparation/README.md b/data_preparation/README.md
index ef20e7c..ee69631 100644
--- a/data_preparation/README.md
+++ b/data_preparation/README.md
@@ -1,4 +1,4 @@
-### Prepare point clouds info files.
+### Prepare MMscan info files.
 
 Given the licenses of respective raw datasets, we recommend users download the raw data from their official websites and then organize them following the below guide.
 Detailed steps are shown as follows.
@@ -9,8 +9,8 @@ Detailed steps are shown as follows.
 
 3. Download Matterport3D data [HERE](https://github.com/niessner/Matterport). Link or move the folder to this level of directory.
 
-4. Organize the file structure. Under `mmscan_data/embodiedscan-split/embodiedscan-v1`, the directory structure should be as below,
-   You are recommanded to create a soft link to the raw data folder under `mmsan_data/embodiedscan-split/embodiedscan-v1`.
+4. Organize the file structure. Under `mmscan_data/embodiedscan-split`, the directory structure should be as below.
+   You are recommended to create a soft link to the raw data folder under `mmscan_data/embodiedscan-split`.
 
    ```
    data/
@@ -29,16 +29,15 @@ Detailed steps are shown as follows.
    Additionally, create a `process_pcd` folder in the same directory to store the results. Similarly, we recommend using a symbolic link, as the total file size might be a little large (approximately 21GB).
 
    PS: If you have followed the embodiedscan tutorial to organize the data, you can skip these steps and link or copy the `data` folder to
-   `mmsan_data/embodiedscan-split/embodiedscan-v1`.
+   `mmscan_data/embodiedscan-split`.
 
    After all the raw data is organized, the directory structure should be as below:
 
    ```
-   embodiedscan-v1/
+   embodiedscan-split/
    ├── data/
    ├── process_pcd/
-   ├── embodiedscan_infos_train.pkl
-   ├── embodiedscan_infos_val.pkl
+   ├── embodiedscan-v1/
    ```
 
 5. Read raw files and generate processed point cloud files by running the following scripts.
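
   Before running the step-5 scripts, it can help to sanity-check the step-4 layout; a minimal sketch, assuming the directory names shown above:

   ```python
   import os

   # Layout assumed from step 4; adjust the root to your checkout.
   root = 'mmscan_data/embodiedscan-split'
   for required in ('data', 'process_pcd', 'embodiedscan-v1'):
       path = os.path.join(root, required)
       print(f"{path}: {'found' if os.path.isdir(path) else 'MISSING'}")
   ```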
diff --git a/data_preparation/process_all_scan.py b/data_preparation/process_all_scan.py
index e5b7022..e984426 100644
--- a/data_preparation/process_all_scan.py
+++ b/data_preparation/process_all_scan.py
@@ -14,7 +14,7 @@
 from utils.scannet_process import process_scannet
 from utils.trscan_process import process_trscan
 
-dict_1 = {}
+es_anno = {}
 
 
 def create_scene_pcd(es_anno, pcd_result):
@@ -23,23 +23,18 @@ def create_scene_pcd(es_anno, pcd_result):
     Args:
         es_anno (dict): The embodiedscan annotation of
             the target scan.
-        pcd_result (tuple) :
-            (1) aliged point clouds coordinates
-                shape (n,3)
-            (2) point clouds color ([0,1])
-                shape (n,3)
-            (3) label (no need here)
+        pcd_result (tuple) : The raw point cloud data of the scan,
+            consisting of:
+            (1) aligned point cloud coordinates with shape (n,3).
+            (2) point cloud colors ([0,1]) with shape (n,3).
+            (3) labels (not needed here).
 
     Returns:
-        tuple :
-            (1) aliged point clouds coordinates
-                shape (n,3)
-            (2) point clouds color ([0,1])
-                shape (n,3)
-            (3) point clouds label (int)
-                shape (n,1)
-            (4) point clouds object id (int)
-                shape (n,1)
+        tuple : The processed point cloud data of the scan, consisting of:
+            (1) aligned point cloud coordinates with shape (n,3).
+            (2) point cloud colors ([0,1]) with shape (n,3).
+            (3) point cloud labels (int) with shape (n,1).
+            (4) point cloud object ids (int) with shape (n,1).
     """
     pc, color, label = pcd_result
     label = np.ones_like(label) * -100
@@ -86,17 +81,21 @@ def process_one_scan(
 ):
     """Process the point clouds of one scan and save in a pth file.
 
-    The pth file is a tuple of:
-        (1) aliged point clouds coordinates
-            shape (n,3)
-        (2) point clouds color ([0,1])
-            shape (n,3)
-        (3) point clouds label (int)
-            shape (n,1)
-        (4) point clouds object id (int)
-            shape (n,1)
+    The pth file is a tuple of np.ndarray, consisting of:
+        (1) aligned point cloud coordinates with shape (n,3).
+        (2) point cloud colors ranging in [0,1] with shape (n,3).
+        (3) point cloud labels with shape (n,1).
+        (4) point cloud object ids with shape (n,1).
     Args:
-        scan_id (str): the scan id
+        scan_id (str): The scan id.
+        save_root (str): The root path to save the pth file.
+        scannet_root (str): The path of the scannet data.
+        mp3d_root (str): The path of the mp3d data.
+        trscan_root (str): The path of the 3rscan data.
+        scannet_matrix (np.ndarray): The axis-align matrices of scannet scans.
+        mp3d_matrix (np.ndarray): The axis-align matrices of mp3d scans.
+        trscan_matrix (np.ndarray): The axis-align matrices of 3rscan scans.
+        mp3d_mapping (dict): The mapping dict for mp3d scan ids.
     """
 
     if os.path.exists(f'{save_root}/{scan_id}.pth'):
@@ -104,11 +103,11 @@ def process_one_scan(
 
     try:
         if 'scene' in scan_id:
-            if 'scannet/' + scan_id not in dict_1:
+            if 'scannet/' + scan_id not in es_anno:
                 return
 
             pcd_info = create_scene_pcd(
-                dict_1['scannet/' + scan_id],
+                es_anno['scannet/' + scan_id],
                 process_scannet(scan_id, scannet_root, scannet_matrix),
             )
 
@@ -118,19 +117,19 @@ def process_one_scan(
                 'region' + scan_id.split('_region')[1],
             )
             mapping_name = f'matterport3d/{raw_scan_id}/{region_id}'
-            if mapping_name not in dict_1:
+            if mapping_name not in es_anno:
                 return
 
             pcd_info = create_scene_pcd(
-                dict_1[mapping_name],
+                es_anno[mapping_name],
                 process_mp3d(scan_id, mp3d_root, mp3d_matrix, mp3d_mapping),
             )
 
         else:
-            if '3rscan/' + scan_id not in dict_1:
+            if '3rscan/' + scan_id not in es_anno:
                 return
             pcd_info = create_scene_pcd(
-                dict_1['3rscan/' + scan_id],
+                es_anno['3rscan/' + scan_id],
                 process_trscan(scan_id, trscan_root, trscan_matrix),
             )
 
@@ -182,8 +181,8 @@ def process_one_scan(
 
     TYPE2INT = np.load(args.train_pkl_path,
                        allow_pickle=True)['metainfo']['categories']
-    dict_1.update(read_annotation_pickle(args.train_pkl_path))
-    dict_1.update(read_annotation_pickle(args.val_pkl_path))
+    es_anno.update(read_annotation_pickle(args.train_pkl_path))
+    es_anno.update(read_annotation_pickle(args.val_pkl_path))
 
     # loading the required scan id
     with open(f'{args.meta_path}/all_scan.json', 'r') as f:
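
Downstream code reads the per-scan pth files documented in `process_one_scan`; a minimal loading sketch, assuming the tuple is written with `torch.save` and using a hypothetical scan id:

```python
import torch

# Each pth file stores the 4-tuple documented in process_one_scan.
pc, color, label, obj_id = torch.load('process_pcd/scene0000_00.pth')
assert pc.shape[1] == 3 and color.shape[1] == 3  # (n,3) coords, [0,1] colors
assert label.shape[0] == pc.shape[0]             # per-point labels and object ids
```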
diff --git a/data_preparation/utils/scannet_process.py b/data_preparation/utils/scannet_process.py
index a674fed..b1e666a 100644
--- a/data_preparation/utils/scannet_process.py
+++ b/data_preparation/utils/scannet_process.py
@@ -18,7 +18,7 @@ def process_scannet(scan_id, data_root, scannet_matrix):
     r = np.asarray(data_color.elements[0].data['red'])
     g = np.asarray(data_color.elements[0].data['green'])
     b = np.asarray(data_color.elements[0].data['blue'])
-    pc_color = (np.stack([r, g, b], axis=1) / 256.0).astype(np.float32)
+    pc_color = (np.stack([r, g, b], axis=1) / 255.0).astype(np.float32)
     axis_align_matrix = scannet_matrix[scan_id]
     pts = np.ones((pc.shape[0], 4), dtype=pc.dtype)
     pts[:, :3] = pc
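
The 255.0 fix is correct because 8-bit channels peak at 255, so dividing by 255.0 maps colors exactly onto [0,1] (the old 256.0 never reached 1.0). For reference, the axis-alignment step built around the visible `pts` buffer is the standard homogeneous transform; a self-contained sketch:

```python
import numpy as np

def align_points(pc: np.ndarray, axis_align_matrix: np.ndarray) -> np.ndarray:
    """Apply a 4x4 axis-align matrix to (n,3) points, mirroring the
    setup shown in process_scannet (final multiply assumed)."""
    pts = np.ones((pc.shape[0], 4), dtype=pc.dtype)
    pts[:, :3] = pc
    return (pts @ axis_align_matrix.T)[:, :3]
```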
diff --git a/mmscan/evaluator/gpt_evaluation.py b/mmscan/evaluator/gpt_evaluation.py
index 3c58f56..f796f8b 100644
--- a/mmscan/evaluator/gpt_evaluation.py
+++ b/mmscan/evaluator/gpt_evaluation.py
@@ -53,9 +53,9 @@ def normal_query(self,
                 The system prompt inputted into GPT.
             user_content_grounps (list[str]) :
                 The user content inputted into GPT.
-            max_tokens (int) : max tokens, default 1000.
+            max_tokens (int) : The maximum number of tokens. Defaults to 1000.
         Returns:
-            dict : the json-format result.
+            dict : The json-format result.
         """
 
         messages = []
@@ -77,13 +77,11 @@ def qa_evaluation(self, QA_sample_dict, thread_index, tmp_path):
         """Employ the GPT evaluator.
 
         Args:
-            QA_sample_dict (str) :
-                The system prompt inputted into GPT.
-            user_content_grounps (list[str]) :
-                The user content inputted into GPT.
-            max_tokens (int) : max tokens, default 1000.
-        Returns:
-            dict : the json-format result.
+            QA_sample_dict (dict) : The QA sample dict with
+                [gt, pred, question] as values.
+            thread_index (int) : The index of the thread.
+            tmp_path (str) : The path to store the
+                temporary json files.
         """
 
         system_prompt, ex_instance = qa_prompt_define()
@@ -137,7 +135,7 @@ def qa_collection(self, num_threads, tmp_path):
             tmp_path (str) :
                 The path to store the temporary json files.
         Returns:
-            dict : the evaluation result.
+            dict : The evaluation result.
         """
 
         eval_dict = {metric: [] for metric in self.qa_metric}
@@ -174,12 +172,12 @@ def load_and_eval(self, raw_batch_input, num_threads=1, tmp_path='./'):
 
         Args:
             raw_batch_input (list[dict]) :
-                the batch of results wanted to evaluate
+                The batch of results to be evaluated.
             num_threads (int) : The number of the threadings.
                 Defaults to 1.
             tmp_path (str) : The temporary path to store the json files.
         Returns:
-            dict : the evaluation result.
+            dict : The evaluation result.
         """
 
         # (1) Update the results and store in the dict.
@@ -235,7 +233,10 @@ def __check_format__(self, raw_input):
         to be checked, should be a list of dict. Every item with the keys:
 
         ["ID","question","pred",""gt"] pred is a list with one one element. gt
-        is a list with >=1 elements. "ID" should be unique!!!!
+        is a list with >=1 elements. "ID" should be unique.
+
+        Args:
+            raw_input (list[dict]) : The input to be checked.
         """
         assert isinstance(
             raw_input,
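
A hypothetical input illustrating the format `__check_format__` enforces:

```python
batch = [
    {
        'ID': 'qa_000001',               # must be unique across samples
        'question': 'What is on the table?',
        'pred': ['A red mug.'],          # exactly one element
        'gt': ['A mug.', 'A red mug.'],  # one or more references
    },
]
# evaluator.load_and_eval(batch, num_threads=1, tmp_path='./')
```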
diff --git a/mmscan/evaluator/metrics/box_metric.py b/mmscan/evaluator/metrics/box_metric.py
index b5851c7..818e33b 100644
--- a/mmscan/evaluator/metrics/box_metric.py
+++ b/mmscan/evaluator/metrics/box_metric.py
@@ -13,6 +13,7 @@ def average_precision(recalls, precisions, mode='area'):
         mode (str): 'area' or '11points', 'area' means calculating the area
             under precision-recall curve, '11points' means calculating
             the average precision of recalls at [0, 0.1, ..., 1]
+            Defaults to 'area'.
 
     Returns:
         float or np.ndarray: Calculated average precision.
@@ -57,7 +58,8 @@ def get_f1_scores(iou_matrix, iou_threshold):
 
     Args:
         iou_matrix (ndarray/tensor):
-            the iou matrix of the predictions and ground truths (shape n*m)
+            The iou matrix of the predictions and ground truths with
+                shape (num_preds, num_gts).
         iou_threshold (float): 0.25/0.5
 
     Returns:
@@ -93,7 +95,7 @@ def __get_fp_tp_array__(iou_array, iou_threshold):
     Args:
         iou_array (ndarray/tensor):
             the iou matrix of the predictions and ground truths
-            (shape len(preds)*len(gts))
+            with shape (num_preds, num_gts).
         iou_threshold (float): 0.25/0.5
 
     Returns:
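
To make the (num_preds, num_gts) shape concrete, here is a toy example of thresholding such an IoU matrix; the tp counting below is a simplification for illustration, not the library's exact matching:

```python
import numpy as np

# 3 predictions vs 2 ground truths.
iou_matrix = np.array([[0.60, 0.10],
                       [0.20, 0.30],
                       [0.00, 0.55]])
iou_threshold = 0.5
# Count a prediction as a true positive if it clears the threshold
# for some ground truth (simplified matching).
tp = int((iou_matrix.max(axis=1) >= iou_threshold).sum())
precision = tp / iou_matrix.shape[0]                # 2/3
recall = tp / iou_matrix.shape[1]                   # 2/2
f1 = 2 * precision * recall / (precision + recall)  # 0.8
```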
diff --git a/mmscan/evaluator/qa_evaluation.py b/mmscan/evaluator/qa_evaluation.py
index 5761c22..e378b54 100644
--- a/mmscan/evaluator/qa_evaluation.py
+++ b/mmscan/evaluator/qa_evaluation.py
@@ -7,16 +7,16 @@
 
 
 class QA_Evaluator:
-    """tradition metrics for QA and Caption evaluation , consists the
+    """Tradition metrics for QA and Caption evaluation , consists the
     implements of.
 
        [EM, BLEU, METEOR, ROUGE, CIDEr, SPICE, SIMCSE, SBERT]
-       SIMCSE, SBERT is speacial metrics and needed GPU tools.
+       SIMCSE and SBERT are special metrics and require a GPU.
 
     Attributes:
-        save_buffer(list[dict]): Save the buffer of Inputs
-        records(list[dict]): Metric results for each sample
-        metric_record(dict): Metric results for each category
+        save_buffer(list[dict]): The buffer of saved inputs.
+        records(list[dict]): Metric results for each sample.
+        metric_record(dict): Metric results for each category.
             (average of all samples with the same category)
     Args:
         model_config(dict): The model config for special metric evaluation.
@@ -67,18 +67,18 @@ def update(self, batch_input):
         """Update a batch of results to the buffer, and then filtering and
         truncating. each item is expected to be a dict with keys.
 
-        ["index", "ID","question","pred",""gt"]
+        ["index", "ID","question","pred","gt"]
 
         1. pred is a list with one element.
         2. gt is a list with >=1 elements.
-        3. "ID" should be unique!!!!
+        3. "ID" should be unique.
 
         Args:
             batch_input (list[dict]):
-            a batch of the raw original input
+                A batch of the raw original inputs.
         Returns:
             Dict: {"EM":EM metric for this batch,
-            "refined_EM":refined EM metric for this batch}
+                "refined_EM":Refined EM metric for this batch}
         """
 
         self.__check_format__(batch_input)
@@ -112,7 +112,7 @@ def start_evaluation(self):
         """Start the evaluation process.
 
         Returns:
-            dict: the metrics
+            dict: The results of the evaluation.
         """
 
         # (1) exact match evaluation
@@ -170,18 +170,16 @@ def start_evaluation(self):
 
     def __check_format__(self, raw_input):
         """Check if the input conform with mmscan evaluation format.
-        Args:
-            The input to be checked, should be a list of dict.
-            Every item with the keys:
-            ["index", "ID","question","pred",""gt"]
-            pred is a list with one one element.
-            gt is a list with >=1 elements.
-            "ID" should be unique!!!!
 
+        Every item should have the keys ["index", "ID","question","pred","gt"];
+            'pred' is a list with one element, 'gt' is a list
+            with >=1 elements. "ID" should be unique.
+        Args:
+            raw_input (list[dict]): The input to be checked.
         """
         assert isinstance(
             raw_input,
-            list), 'The input of MMScan evaluator should be a list of dict. '
+            list), 'The input of QA evaluator should be a list of dict. '
 
         for _index in range(len(raw_input)):
             if 'index' not in raw_input[_index]:
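
A hedged end-to-end sketch of the call sequence these methods imply (the import path and the empty `model_config` are assumptions; SIMCSE/SBERT need a real config and a GPU):

```python
from mmscan.evaluator.qa_evaluation import QA_Evaluator  # import path assumed

batch = [{
    'index': 0,
    'ID': 'qa_000001',
    'question': 'How many chairs are in the room?',
    'pred': ['There are three chairs.'],          # single-element list
    'gt': ['Three.', 'There are three chairs.'],  # >=1 references
}]
evaluator = QA_Evaluator(model_config={})  # real config described in the docstring
evaluator.update(batch)                    # returns per-batch EM / refined_EM
metrics = evaluator.start_evaluation()     # full metric dict
```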
diff --git a/mmscan/evaluator/vg_evaluation.py b/mmscan/evaluator/vg_evaluation.py
index 90ff78b..03cb3f0 100644
--- a/mmscan/evaluator/vg_evaluation.py
+++ b/mmscan/evaluator/vg_evaluation.py
@@ -10,12 +10,11 @@
 
 
 class VG_Evaluator:
-    """Evaluator for MMScan Visual Grounding benchmark.
+    """Evaluator for MMScan Visual Grounding benchmark. The evaluation metric
+    includes "AP","AP_C","AR","gTop-k".
 
     Attributes:
-        eval_metric: All the evaluation metric, includes
-            "AP","AP_C","AR","gTop-k"
-        save_buffer(list[dict]): Save the buffer of Inputs
+        save_buffer(list[dict]): The buffer of saved inputs.
 
         records(list[dict]): Metric results for each sample
 
@@ -27,7 +26,7 @@ class VG_Evaluator:
     """
 
     def __init__(self, verbose=True) -> None:
-        print('new methods!')
+
         self.verbose = verbose
         self.eval_metric_type = ['AP', 'AR']
         self.top_k_visible = [1, 3, 5]
@@ -55,7 +54,7 @@ def update(self, raw_batch_input):
 
         Args:
             raw_batch_input (list[dict]):
-            a batch of the raw original input
+                A batch of the raw original inputs.
         """
         self.__check_format__(raw_batch_input)
         self.save_buffer.extend(raw_batch_input)
@@ -73,8 +72,6 @@ def start_evaluation(self):
 
             # (1) len(gt)==0 : skip
             if self.__is_zero__(data_item['gt_bboxes']):
-                print('error!!!')
-
                 continue
 
             # (2) len(pred)==0 : model's wrong
@@ -144,9 +141,9 @@ def start_evaluation(self):
     def collect_result(self):
         """Collect the result from the evaluation process.
 
-        Stores them based on some subclass.
+        Stores them based on their subclass.
         Returns:
-             category_results(dict): Average results per category
+             category_results(dict): Average results per category.
         """
         category_results = {}
         category_results['overall'] = {}
@@ -186,7 +183,7 @@ def print_result(self):
         """Showing the result table.
 
         Returns:
-            table(str): the metric result table
+            table(str): The metric result table.
         """
         assert len(self.category_records) > 0, 'No result yet.'
         self.category_records = {
@@ -246,10 +243,10 @@ def __category_mapping__(self, sub_class):
         """Mapping the subclass name to the category name.
 
         Args:
-            sub_class (str): the subclass name in the original samples
+            sub_class (str): The subclass name in the original samples.
 
         Returns:
-            category (str): the category name.
+            category (str): The category name.
         """
         sub_class = sub_class.lower()
         sub_class = sub_class.replace('single', 'sngl')
@@ -265,10 +262,10 @@ def __calculate_iou_array_(self, data_item):
         """Calculate some information needed for eavl.
 
         Args:
-             data_item (dict): the subclass name in the original samples
+             data_item (dict): The original sample to be evaluated.
         Returns:
              nd.array, int, int :
-                the iou array sorted by the confidence,
+                The iou array sorted by confidence, the
                 number of predictions, number of gts.
         """
 
@@ -297,13 +294,15 @@ def __is_zero__(self, box):
         return (len(box) == 0)
 
     def __check_format__(self, raw_input):
-        """Check if the input conform with mmscan evaluation format.
+        """Check if the input conform with mmscan evaluation format. Transform
+        the input box format.
 
-        transform 9 DoF box to ('center'/'size'/'rot_matrix')
+        Args:
+            raw_input (list[dict]): The input of VG evaluator.
         """
         assert isinstance(
             raw_input,
-            list), 'The input of MMScan evaluator should be a list of dict. '
+            list), 'The input of VG evaluator should be a list of dict. '
         raw_input = raw_input
 
         for _index in tqdm(range(len(raw_input))):
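
For orientation, a 9-DoF box here is center + size + Euler angles; a sketch of the decomposition `__check_format__` performs (values illustrative, Euler convention an assumption):

```python
import numpy as np
from scipy.spatial.transform import Rotation

box_9dof = np.array([1.0, 2.0, 0.5,    # center (x, y, z)
                     0.8, 0.6, 1.2,    # size (dx, dy, dz)
                     0.0, 0.0, 1.57])  # Euler angles
center, size, euler = box_9dof[:3], box_9dof[3:6], box_9dof[6:]
# Stored as ('center'/'size'/'rot_matrix'); the 'ZXY' order is assumed.
rot_matrix = Rotation.from_euler('ZXY', euler).as_matrix()
```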
diff --git a/mmscan/mmscan.py b/mmscan/mmscan.py
index 1554ed1..cc479ab 100644
--- a/mmscan/mmscan.py
+++ b/mmscan/mmscan.py
@@ -9,7 +9,7 @@
 import torch
 from torch.utils.data import Dataset
 
-from mmscan.utils.box_utils import __9dof_to_6dof__
+from mmscan.utils.box_utils import from_9dof_to_6dof
 from mmscan.utils.data_io import id_mapping, load_json, read_annotation_pickle
 from mmscan.utils.task_utils import anno_token_flatten
 
@@ -215,7 +215,7 @@ def __getitem__(self, index_):
                         Input bounding boxes, 9 DoF.
 
         Args:
-            index_ (int): the index
+            index_ (int): The index.
         Returns:
             dict: The sample item corresponding to the index.
         """
@@ -279,7 +279,7 @@ def get_possess(self, table_name: str, scan_idx: str):
         """Getting all database about the scan from embodeidscan.
 
         Args:
-            table_name (str): type of the expected data.
+            table_name (str): The type of the expected data.
             scan_idx (str): The scan id to get the data.
         Returns:
             The data corresponding to the table_name and scan_idx.
@@ -387,9 +387,9 @@ def __process_pcd_info__(self, scan_idx: str):
         labels and the center of the scan.
 
         Args:
-            scan_idx (str): the scan ID.
+            scan_idx (str): ID of the scan.
         Returns:
-            dict : corresponding scan information.
+            dict : The corresponding scan information.
         """
 
         assert (scan_idx in self.embodiedscan_anno.keys()
@@ -422,10 +422,9 @@ def __process_box_info__(self, scan_idx: str):
         bounding boxes in format of [ID: {"bbox":bbox, "type":type},...].
 
         Args:
-            scan_idx (str): the scan ID.
+            scan_idx (str): ID of the scan.
         Returns:
-            dict : corresponding bounding boxes
-            information.
+            dict : The corresponding bounding boxes information.
         """
         assert (scan_idx in self.embodiedscan_anno.keys()
                 ), 'Scan {} is not in {} split'.format(scan_idx, self.split)
@@ -447,10 +446,10 @@ def __process_img_info__(self, scan_idx: str):
         extrinsics, image paths(both rgb & depth) and the visible object ids.
 
         Args:
-            scan_idx (str): the scan ID.
+            scan_idx (str): ID of the scan.
         Returns:
-            list[dict] : corresponding information
-            for each camera.
+            list[dict] : The corresponding image information
+                for each camera.
         """
         assert (scan_idx in self.embodiedscan_anno.keys()
                 ), 'Scan {} is not in {} split'.format(scan_idx, self.split)
@@ -491,14 +490,14 @@ def down_9dof_to_6dof(self, pcd, box_9dof) -> np.ndarray:
                 The transformed 6DOF bounding box.
         """
 
-        return __9dof_to_6dof__(pcd, box_9dof)
+        return from_9dof_to_6dof(pcd, box_9dof)
 
     def __downsample_annos__(self, annos: List[dict], ratio: float):
         """downsample the annotations with a given ratio.
 
         Args:
-            annos (list[dict]): the original annotations.
-            ratio (float): the ratio to downsample.
+            annos (list[dict]): The original annotations.
+            ratio (float): The ratio to downsample.
         Returns:
             list[dict] : The result.
         """
diff --git a/mmscan/utils/box_utils.py b/mmscan/utils/box_utils.py
index 68afb88..7e3f764 100644
--- a/mmscan/utils/box_utils.py
+++ b/mmscan/utils/box_utils.py
@@ -156,7 +156,7 @@ def normalize_box(scene_pcd, embodied_scan_bbox):
     return bbox
 
 
-def __9dof_to_6dof__(pcd_data, bbox_):
+def from_9dof_to_6dof(pcd_data, bbox_):
     # Note: this conversion loses information, so it is not recommended.
     return normalize_box(pcd_data, bbox_)
 
@@ -228,7 +228,7 @@ def euler_iou3d_bbox(center1, size1, rot1, center2, size2, rot2):
         rot1 (Tensor): rot matrix of group1.
 
     Returns:
-        numpy.ndarray: (n, m)the 3D IoU
+        numpy.ndarray: (n, m) the 3D IoU.
     """
     if torch.cuda.is_available():
         center1 = center1.cuda()
@@ -250,10 +250,10 @@ def box_num(box):
     """Return the number of boxes in a grounp.
 
     Args:
-        box (list/tuple, tensor): boxes in a grounp.
+        box (list/tuple, tensor): Boxes in a group.
 
     Returns:
-        int : the number
+        int : The number of boxes.
     """
     if isinstance(box, (list, tuple)):
         return box[0].shape[0]
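
The group convention `box_num` relies on: a group is either a (center, size, rot) tuple whose first element carries the count, or a plain tensor of boxes. A sketch, with the else-branch assumed from context:

```python
def count_boxes(box):
    """Mirror of box_num: tuples/lists are (center, size, rot) groups."""
    if isinstance(box, (list, tuple)):
        return box[0].shape[0]  # center tensor has shape (n, 3)
    return box.shape[0]         # plain (n, ...) tensor (assumed)
```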
diff --git a/models/EmbodiedScan/configs/grounding/mv-grounding_1xb1_embodiedscan-tiny-vg-9dof.py b/models/EmbodiedScan/configs/grounding/mv-grounding_1xb1_embodiedscan-tiny-vg-9dof.py
deleted file mode 100644
index e402727..0000000
--- a/models/EmbodiedScan/configs/grounding/mv-grounding_1xb1_embodiedscan-tiny-vg-9dof.py
+++ /dev/null
@@ -1,210 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if use ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounder',
-    num_queries=256,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    backbone=dict(
-        type='mmdet.ResNet',
-        depth=50,
-        base_channels=16,  # to make it consistent with mink resnet
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
-        frozen_stages=1,
-        norm_cfg=dict(type='BN', requires_grad=False),
-        norm_eval=True,
-        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-        style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34),
-    use_xyz_feat=True,
-    # change due to no img feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[128, 256, 512, 1024],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'MultiView3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='MultiViewPipeline',
-         n_images=20,
-         transforms=[
-             dict(type='LoadImageFromFile', backend_args=backend_args),
-             dict(type='LoadDepthFromFile', backend_args=backend_args),
-             dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
-             dict(type='PointSample', num_points=n_points // 10),
-             dict(type='Resize', scale=(480, 480), keep_ratio=False)
-         ]),
-    dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='MultiViewPipeline',
-         n_images=50,
-         ordered=True,
-         transforms=[
-             dict(type='LoadImageFromFile', backend_args=backend_args),
-             dict(type='LoadDepthFromFile', backend_args=backend_args),
-             dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
-             dict(type='PointSample', num_points=n_points // 10),
-             dict(type='Resize', scale=(480, 480), keep_ratio=False)
-         ]),
-    dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-
-# TODO: to determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=32,
-    num_workers=4,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(type='RepeatDataset',
-                 times=1,
-                 dataset=dict(type=dataset_type,
-                              data_root=data_root,
-                              ann_file='embodiedscan_infos_train.pkl',
-                              vg_file='embodiedscan_train_mini_vg.json',
-                              metainfo=metainfo,
-                              pipeline=train_pipeline,
-                              test_mode=False,
-                              filter_empty_gt=True,
-                              box_type_3d='Euler-Depth')))
-
-val_dataloader = dict(batch_size=32,
-                      num_workers=4,
-                      persistent_workers=True,
-                      drop_last=False,
-                      sampler=dict(type='DefaultSampler', shuffle=False),
-                      dataset=dict(type=dataset_type,
-                                   data_root=data_root,
-                                   ann_file='embodiedscan_infos_val.pkl',
-                                   vg_file='embodiedscan_val_tiny_vg.json',
-                                   metainfo=metainfo,
-                                   pipeline=test_pipeline,
-                                   test_mode=True,
-                                   filter_empty_gt=True,
-                                   box_type_3d='Euler-Depth'))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetric')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/mv-3ddet/mv-grounding.pth'  # noqa
diff --git a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-100-vg-9dof.py b/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-100-vg-9dof.py
deleted file mode 100644
index a94bbcf..0000000
--- a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-100-vg-9dof.py
+++ /dev/null
@@ -1,233 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if use ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounder',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    backbone=dict(
-        type='mmdet.ResNet',
-        depth=50,
-        base_channels=16,  # to make it consistent with mink resnet
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
-        frozen_stages=1,
-        norm_cfg=dict(type='BN', requires_grad=False),
-        norm_eval=True,
-        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-        style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34),
-    use_xyz_feat=True,
-    # change due to no img feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[128, 256, 512, 1024],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'MultiView3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='MultiViewPipeline',
-         n_images=20,
-         transforms=[
-             dict(type='LoadImageFromFile', backend_args=backend_args),
-             dict(type='LoadDepthFromFile', backend_args=backend_args),
-             dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
-             dict(type='PointSample', num_points=n_points // 10),
-             dict(type='Resize', scale=(480, 480), keep_ratio=False)
-         ]),
-    dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='MultiViewPipeline',
-         n_images=50,
-         ordered=True,
-         transforms=[
-             dict(type='LoadImageFromFile', backend_args=backend_args),
-             dict(type='LoadDepthFromFile', backend_args=backend_args),
-             dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
-             dict(type='PointSample', num_points=n_points // 10),
-             dict(type='Resize', scale=(480, 480), keep_ratio=False)
-         ]),
-    dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-
-# TODO: to determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file='es_gen_text/vg_full/VG_val_flattened_token_positive.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-# test_dataloader = dict(batch_size=12,
-#                        num_workers=12,
-#                        persistent_workers=True,
-#                        drop_last=False,
-#                        sampler=dict(type='DefaultSampler', shuffle=False),
-#                        dataset=dict(type=dataset_type,
-#                                     data_root=data_root,
-#                                     ann_file='embodiedscan_infos_test.pkl',
-#                                     vg_file='embodiedscan_test_vg.json',
-#                                     metainfo=metainfo,
-#                                     pipeline=test_pipeline,
-#                                     test_mode=True,
-#                                     filter_empty_gt=True,
-#                                     box_type_3d='Euler-Depth',
-#                                     tokens_positive_rebuild=True))
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth'  # noqa
-load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/ckpts/3ddet.pth'  # noqa
diff --git a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof-256query.py b/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof-256query.py
deleted file mode 100644
index 46446f4..0000000
--- a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof-256query.py
+++ /dev/null
@@ -1,233 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if use ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounder',
-    num_queries=256,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    backbone=dict(
-        type='mmdet.ResNet',
-        depth=50,
-        base_channels=16,  # to make it consistent with mink resnet
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
-        frozen_stages=1,
-        norm_cfg=dict(type='BN', requires_grad=False),
-        norm_eval=True,
-        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-        style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34),
-    use_xyz_feat=True,
-    # change due to no img feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[128, 256, 512, 1024],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'MultiView3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='MultiViewPipeline',
-         n_images=20,
-         transforms=[
-             dict(type='LoadImageFromFile', backend_args=backend_args),
-             dict(type='LoadDepthFromFile', backend_args=backend_args),
-             dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
-             dict(type='PointSample', num_points=n_points // 10),
-             dict(type='Resize', scale=(480, 480), keep_ratio=False)
-         ]),
-    dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='MultiViewPipeline',
-         n_images=50,
-         ordered=True,
-         transforms=[
-             dict(type='LoadImageFromFile', backend_args=backend_args),
-             dict(type='LoadDepthFromFile', backend_args=backend_args),
-             dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
-             dict(type='PointSample', num_points=n_points // 10),
-             dict(type='Resize', scale=(480, 480), keep_ratio=False)
-         ]),
-    dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-
-# TODO: to determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file=
-        'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-# test_dataloader = dict(batch_size=12,
-#                        num_workers=12,
-#                        persistent_workers=True,
-#                        drop_last=False,
-#                        sampler=dict(type='DefaultSampler', shuffle=False),
-#                        dataset=dict(type=dataset_type,
-#                                     data_root=data_root,
-#                                     ann_file='embodiedscan_infos_test.pkl',
-#                                     vg_file='embodiedscan_test_vg.json',
-#                                     metainfo=metainfo,
-#                                     pipeline=test_pipeline,
-#                                     test_mode=True,
-#                                     filter_empty_gt=True,
-#                                     box_type_3d='Euler-Depth',
-#                                     tokens_positive_rebuild=True))
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth'  # noqa
diff --git a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof.py
deleted file mode 100644
index 78c2d22..0000000
--- a/models/EmbodiedScan/configs/grounding/mv_8xb12-mmscan-20-5-vg-9dof.py
+++ /dev/null
@@ -1,234 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if use ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounder',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    backbone=dict(
-        type='mmdet.ResNet',
-        depth=50,
-        base_channels=16,  # to make it consistent with mink resnet
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
-        frozen_stages=1,
-        norm_cfg=dict(type='BN', requires_grad=False),
-        norm_eval=True,
-        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-        style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34),
-    use_xyz_feat=True,
-    # change due to no img feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[128, 256, 512, 1024],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'MultiView3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='MultiViewPipeline',
-         n_images=20,
-         transforms=[
-             dict(type='LoadImageFromFile', backend_args=backend_args),
-             dict(type='LoadDepthFromFile', backend_args=backend_args),
-             dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
-             dict(type='PointSample', num_points=n_points // 10),
-             dict(type='Resize', scale=(480, 480), keep_ratio=False)
-         ]),
-    dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='MultiViewPipeline',
-         n_images=50,
-         ordered=True,
-         transforms=[
-             dict(type='LoadImageFromFile', backend_args=backend_args),
-             dict(type='LoadDepthFromFile', backend_args=backend_args),
-             dict(type='ConvertRGBDToPoints', coord_type='CAMERA'),
-             dict(type='PointSample', num_points=n_points // 10),
-             dict(type='Resize', scale=(480, 480), keep_ratio=False)
-         ]),
-    dict(type='AggregateMultiViewPoints', coord_type='DEPTH'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-
-# TODO: determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file=
-        'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-# test_dataloader = dict(batch_size=12,
-#                        num_workers=12,
-#                        persistent_workers=True,
-#                        drop_last=False,
-#                        sampler=dict(type='DefaultSampler', shuffle=False),
-#                        dataset=dict(type=dataset_type,
-#                                     data_root=data_root,
-#                                     ann_file='embodiedscan_infos_test.pkl',
-#                                     vg_file='embodiedscan_test_vg.json',
-#                                     metainfo=metainfo,
-#                                     pipeline=test_pipeline,
-#                                     test_mode=True,
-#                                     filter_empty_gt=True,
-#                                     box_type_3d='Euler-Depth',
-#                                     tokens_positive_rebuild=True))
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/wangtai/EmbodiedScan/work_dirs/mv-3ddet-challenge/epoch_12.pth'  # noqa
-load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/ckpts/3ddet.pth'  # noqa
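For orientation, the multi-view pipeline in the config deleted above samples `n_points // 10` points from each RGB-D view (20 views at train time, 50 at test time), aggregates them in the DEPTH frame, and then resamples the merged cloud back down to `n_points`. A minimal sketch of that point budget, assuming the transforms behave as their names suggest (the real implementations live in the EmbodiedScan codebase):

```python
# Point-budget arithmetic for the multi-view pipeline (illustrative only).
n_points = 100000

def multiview_budget(n_images: int) -> tuple[int, int]:
    per_view = n_points // 10          # PointSample inside MultiViewPipeline
    aggregated = n_images * per_view   # AggregateMultiViewPoints concatenates views
    final = min(aggregated, n_points)  # the outer PointSample caps the merged cloud
    return aggregated, final

print(multiview_budget(20))  # train: (200000, 100000)
print(multiview_budget(50))  # test:  (500000, 100000)
```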
diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-10-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-10-5-vg-9dof.py
deleted file mode 100644
index 29736ce..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-10-5-vg-9dof.py
+++ /dev/null
@@ -1,199 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if you use Ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounderMod',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    # backbone=dict(
-    #     type='mmdet.ResNet',
-    #     depth=50,
-    #     base_channels=16,  # to make it consistent with mink resnet
-    #     num_stages=4,
-    #     out_indices=(0, 1, 2, 3),
-    #     frozen_stages=1,
-    #     norm_cfg=dict(type='BN', requires_grad=False),
-    #     norm_eval=True,
-    #     init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-    #     style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34),
-    use_xyz_feat=True,
-    # changed since there is no image feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[64, 128, 256, 512],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'PointCloud3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-# TODO: determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=24,
-    num_workers=4,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_10Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=24,
-    num_workers=4,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file=
-        'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json',
-        #    vg_file='embodiedscan_val_mini_vg.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth'  # noqa
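The `NN-MM` suffix in these `pcd_*` grounding config names appears to encode the train/val subsampling of the MMScan VG annotations: the first number is the train percentage, the second the val percentage, with `100` denoting the full val file. A small lookup, inferred purely from the `vg_file` fields in this diff:

```python
# Train/val VG annotation files per config suffix, as read off the deleted
# configs in this diff (descriptive only, not an API of the repo).
VG_SPLITS = {
    '10-5':   ('VG_train_10Percent_flattened_token_positive.json',
               'VG_val_5Percent_flattened_token_positive.json'),
    '20-5':   ('VG_train_20Percent_flattened_token_positive.json',
               'VG_val_5Percent_flattened_token_positive.json'),
    '20-100': ('VG_train_20Percent_flattened_token_positive.json',
               'VG_val_flattened_token_positive.json'),  # full val split
}
```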
diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof-nocolor.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof-nocolor.py
deleted file mode 100644
index 9b2ffc5..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof-nocolor.py
+++ /dev/null
@@ -1,198 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if you use Ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounderMod',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    # backbone=dict(
-    #     type='mmdet.ResNet',
-    #     depth=50,
-    #     base_channels=16,  # to make it consistent with mink resnet
-    #     num_stages=4,
-    #     out_indices=(0, 1, 2, 3),
-    #     frozen_stages=1,
-    #     norm_cfg=dict(type='BN', requires_grad=False),
-    #     norm_eval=True,
-    #     init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-    #     style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34),
-    use_xyz_feat=True,
-    # changed since there is no image feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[64, 128, 256, 512],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'PointCloud3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline', keep_rgb=False),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline', keep_rgb=False),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-# TODO: determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=24,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=24,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file='es_gen_text/vg_full/VG_val_flattened_token_positive.json',
-        #    vg_file='embodiedscan_val_mini_vg.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth'  # noqa
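Note how this `-nocolor` variant pairs `keep_rgb=False` in the pipelines with `backbone_3d.in_channels=3`, whereas the color variants feed 6 input channels. A hedged illustration of the per-point feature layouts the two settings imply (tensor names here are ours, not the repo's):

```python
import torch

n = 100000
xyz = torch.rand(n, 3)  # aligned point coordinates
rgb = torch.rand(n, 3)  # colors in [0, 1]

feats_color = torch.cat([xyz, rgb], dim=-1)  # (n, 6) -> MinkResNet in_channels=6
feats_nocolor = xyz                          # (n, 3) -> in_channels=3 with keep_rgb=False
assert feats_color.shape[-1] == 6 and feats_nocolor.shape[-1] == 3
```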
diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof.py
deleted file mode 100644
index c09c1b7..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-100-vg-9dof.py
+++ /dev/null
@@ -1,198 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if you use Ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounderMod',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    # backbone=dict(
-    #     type='mmdet.ResNet',
-    #     depth=50,
-    #     base_channels=16,  # to make it consistent with mink resnet
-    #     num_stages=4,
-    #     out_indices=(0, 1, 2, 3),
-    #     frozen_stages=1,
-    #     norm_cfg=dict(type='BN', requires_grad=False),
-    #     norm_eval=True,
-    #     init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-    #     style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34),
-    use_xyz_feat=True,
-    # changed since there is no image feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[64, 128, 256, 512],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'PointCloud3DGroundingDataset'
-data_root = '/mnt/petrelfs/linjingli/tmp/code/MMScan-code/VG/benchmark/EmbodiedScan/data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-# TODO: determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=24,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=24,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file='es_gen_text/vg_full/VG_val_flattened_token_positive.json',
-        #    vg_file='embodiedscan_val_mini_vg.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth'  # noqa
diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-load.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-load.py
deleted file mode 100644
index 57c2622..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-load.py
+++ /dev/null
@@ -1,199 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if you use Ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounderMod',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    # backbone=dict(
-    #     type='mmdet.ResNet',
-    #     depth=50,
-    #     base_channels=16,  # to make it consistent with mink resnet
-    #     num_stages=4,
-    #     out_indices=(0, 1, 2, 3),
-    #     frozen_stages=1,
-    #     norm_cfg=dict(type='BN', requires_grad=False),
-    #     norm_eval=True,
-    #     init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-    #     style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34),
-    use_xyz_feat=True,
-    # changed since there is no image feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[64, 128, 256, 512],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'PointCloud3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-# TODO: determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=24,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=24,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file=
-        'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json',
-        #    vg_file='embodiedscan_val_mini_vg.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/ckpts/3ddet.pth'  # noqa
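This `-load` variant differs from its siblings mainly in warm-starting from a pretrained 3D detection checkpoint via `load_from`. Since the grounder adds text-conditioned components a pure detector never had, a warm start of this kind generally relies on non-strict key matching; a toy sketch of the idea (not the repo's actual loading code, and the checkpoint path is a placeholder):

```python
import torch
from torch import nn

# Toy stand-in for the grounder; the real model comes from the EmbodiedScan registry.
model = nn.Sequential(nn.Linear(6, 256), nn.Linear(256, 9))

ckpt = torch.load('/path/to/3ddet.pth', map_location='cpu')
state = ckpt.get('state_dict', ckpt)  # mmengine checkpoints nest weights under 'state_dict'
missing, unexpected = model.load_state_dict(state, strict=False)
print(f'{len(missing)} params left at init, {len(unexpected)} checkpoint keys unused')
```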
diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-nocolor.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-nocolor.py
deleted file mode 100644
index 64d267b..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof-nocolor.py
+++ /dev/null
@@ -1,199 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if you use Ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounderMod',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    # backbone=dict(
-    #     type='mmdet.ResNet',
-    #     depth=50,
-    #     base_channels=16,  # to make it consistent with mink resnet
-    #     num_stages=4,
-    #     out_indices=(0, 1, 2, 3),
-    #     frozen_stages=1,
-    #     norm_cfg=dict(type='BN', requires_grad=False),
-    #     norm_eval=True,
-    #     init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-    #     style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=3, depth=34),
-    use_xyz_feat=True,
-    # changed since there is no image feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[64, 128, 256, 512],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'PointCloud3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline', keep_rgb=False),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline', keep_rgb=False),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-# TODO: determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=24,
-    num_workers=4,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=24,
-    num_workers=4,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file=
-        'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json',
-        #    vg_file='embodiedscan_val_mini_vg.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth'  # noqa
diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof.py
deleted file mode 100644
index 6953544..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_4xb24-mmscan-20-5-vg-9dof.py
+++ /dev/null
@@ -1,199 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if you use Ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounderMod',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    # backbone=dict(
-    #     type='mmdet.ResNet',
-    #     depth=50,
-    #     base_channels=16,  # to make it consistent with mink resnet
-    #     num_stages=4,
-    #     out_indices=(0, 1, 2, 3),
-    #     frozen_stages=1,
-    #     norm_cfg=dict(type='BN', requires_grad=False),
-    #     norm_eval=True,
-    #     init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-    #     style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34),
-    use_xyz_feat=True,
-    # changed since there is no image feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[64, 128, 256, 512],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'PointCloud3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-# TODO: determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=24,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_20Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=24,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file=
-        'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json',
-        #    vg_file='embodiedscan_val_mini_vg.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth'  # noqa
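All of these deleted configs share one optimization recipe: AdamW at `lr=5e-4` with weight decay 5e-4, the decoder at a 0.1 learning-rate multiplier, the text encoder frozen (`lr_mult=0.0`), and a MultiStepLR that decays by 10x at epochs 8 and 11 over a 12-epoch run. A quick sketch of the effective per-epoch rates, assuming the scheduler steps at epoch boundaries:

```python
lr, gamma, milestones = 5e-4, 0.1, (8, 11)

def epoch_lr(epoch: int, lr_mult: float = 1.0) -> float:
    """Effective learning rate of a parameter group at a 1-indexed epoch."""
    decays = sum(epoch > m for m in milestones)
    return lr * lr_mult * gamma ** decays

for e in (1, 8, 9, 11, 12):
    print(e, epoch_lr(e), epoch_lr(e, lr_mult=0.1))  # base group vs. decoder group
# epochs 1-8: 5e-4, epochs 9-11: 5e-5, epoch 12: 5e-6 (decoder: one tenth of each)
```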
diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num100.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num100.py
new file mode 100644
index 0000000..ffbe6bd
--- /dev/null
+++ b/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num100.py
@@ -0,0 +1,273 @@
+
+# Edit this: path to the pretrained multi-view 3D detection checkpoint.
+load_from = '/path/to/mv-3ddet.pth'
+backend_args = None
+custom_hooks = [
+    dict(after_iter=True, type='EmptyCacheHook'),
+]
+data_root = 'data'
+dataset_type = 'PointCloud3DGroundingDataset'
+default_hooks = dict(
+    checkpoint=dict(interval=1, max_keep_ckpts=3, type='CheckpointHook'),
+    logger=dict(interval=50, type='LoggerHook'),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    timer=dict(type='IterTimerHook'))
+default_scope = 'embodiedscan'
+env_cfg = dict(
+    cudnn_benchmark=False,
+    dist_cfg=dict(backend='nccl', port=22873),
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
+find_unused_parameters = True
+launcher = 'slurm'
+
+log_level = 'INFO'
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50)
+lr = 0.0005
+metainfo = dict(classes='all')
+model = dict(
+    backbone_3d=dict(depth=34, in_channels=6, type='MinkResNet'),
+    bbox_head=dict(
+        contrastive_cfg=dict(bias=True, log_scale='auto', max_text_len=256),
+        decouple_bbox_loss=True,
+        decouple_groups=4,
+        decouple_weights=[
+            0.2,
+            0.2,
+            0.2,
+            0.4,
+        ],
+        loss_bbox=dict(
+            group='g8', loss_weight=1.0, mode='l1', type='BBoxCDLoss'),
+        loss_cls=dict(
+            alpha=0.25,
+            gamma=2.0,
+            loss_weight=1.0,
+            type='mmdet.FocalLoss',
+            use_sigmoid=True),
+        num_classes=256,
+        share_pred_layer=True,
+        sync_cls_avg_factor=True,
+        type='GroundingHead'),
+    coord_type='DEPTH',
+    data_preprocessor=dict(
+        bgr_to_rgb=True,
+        mean=[
+            123.675,
+            116.28,
+            103.53,
+        ],
+        pad_size_divisor=32,
+        std=[
+            58.395,
+            57.12,
+            57.375,
+        ],
+        type='Det3DDataPreprocessor'),
+    decoder=dict(
+        layer_cfg=dict(
+            cross_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8),
+            cross_attn_text_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0),
+            self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8)),
+        num_layers=6,
+        post_norm_cfg=None,
+        return_intermediate=True),
+    neck_3d=dict(
+        in_channels=[
+            64,
+            128,
+            256,
+            512,
+        ],
+        num_classes=1,
+        out_channels=256,
+        pts_prune_threshold=1000,
+        type='MinkNeck',
+        voxel_size=0.01),
+    num_queries=100,
+    test_cfg=None,
+    train_cfg=dict(
+        assigner=dict(
+            match_costs=[
+                dict(type='BinaryFocalLossCost', weight=1.0),
+                dict(type='BBox3DL1Cost', weight=2.0),
+                dict(type='IoU3DCost', weight=2.0),
+            ],
+            type='HungarianAssigner3D')),
+    type='SparseFeatureFusion3DGrounderMod',
+    use_xyz_feat=True,
+    voxel_size=0.01)
+n_points = 100000
+optim_wrapper = dict(
+    clip_grad=dict(max_norm=10, norm_type=2),
+    optimizer=dict(lr=0.0005, type='AdamW', weight_decay=0.0005),
+    paramwise_cfg=dict(
+        custom_keys=dict(
+            decoder=dict(decay_mult=1.0, lr_mult=0.1),
+            text_encoder=dict(lr_mult=0.0))),
+    type='OptimWrapper')
+param_scheduler = dict(
+    begin=0,
+    by_epoch=True,
+    end=12,
+    gamma=0.1,
+    milestones=[
+        8,
+        11,
+    ],
+    type='MultiStepLR')
+resume = False
+test_cfg = dict(type='TestLoop')
+test_dataloader = dict(
+    batch_size=24,
+    dataset=dict(
+        ann_file='embodiedscan_infos_val.pkl',
+        box_type_3d='Euler-Depth',
+        data_root='data',
+        filter_empty_gt=True,
+        metainfo=dict(classes='all'),
+        pipeline=[
+            dict(type='LoadAnnotations3D'),
+            dict(type='DefaultPipeline'),
+            dict(num_points=100000, type='PointSample'),
+            dict(
+                keys=[
+                    'points',
+                    'gt_bboxes_3d',
+                    'gt_labels_3d',
+                ],
+                type='Pack3DDetInputs'),
+        ],
+        test_mode=True,
+        tokens_positive_rebuild=True,
+        type='MMScanPointCloud3DGroundingDataset',
+        vg_file=''),  # fill in: path to the MMScan VG annotation JSON
+    drop_last=False,
+    num_workers=12,
+    persistent_workers=True,
+    sampler=dict(shuffle=False, type='DefaultSampler'))
+test_evaluator = dict(type='GroundingMetricMod')
+test_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(type='DefaultPipeline'),
+    dict(num_points=100000, type='PointSample'),
+    dict(
+        keys=[
+            'points',
+            'gt_bboxes_3d',
+            'gt_labels_3d',
+        ],
+        type='Pack3DDetInputs'),
+]
+train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=3)
+train_dataloader = dict(
+    batch_size=24,
+    dataset=dict(
+        dataset=dict(
+            ann_file='embodiedscan_infos_train.pkl',
+            box_type_3d='Euler-Depth',
+            data_root='data',
+            filter_empty_gt=True,
+            metainfo=dict(classes='all'),
+            pipeline=[
+                dict(type='LoadAnnotations3D'),
+                dict(type='DefaultPipeline'),
+                dict(num_points=100000, type='PointSample'),
+                dict(
+                    rot_range=[
+                        -0.087266,
+                        0.087266,
+                    ],
+                    scale_ratio_range=[
+                        0.9,
+                        1.1,
+                    ],
+                    shift_height=False,
+                    translation_std=[
+                        0.1,
+                        0.1,
+                        0.1,
+                    ],
+                    type='GlobalRotScaleTrans'),
+                dict(
+                    keys=[
+                        'points',
+                        'gt_bboxes_3d',
+                        'gt_labels_3d',
+                    ],
+                    type='Pack3DDetInputs'),
+            ],
+            test_mode=False,
+            tokens_positive_rebuild=True,
+            type='MMScanPointCloud3DGroundingDataset',
+            vg_file=''),
+        times=1,
+        type='RepeatDataset'),
+    num_workers=12,
+    persistent_workers=True,
+    sampler=dict(shuffle=True, type='DefaultSampler'))
+train_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(type='DefaultPipeline'),
+    dict(num_points=100000, type='PointSample'),
+    dict(
+        rot_range=[
+            -0.087266,
+            0.087266,
+        ],
+        scale_ratio_range=[
+            0.9,
+            1.1,
+        ],
+        shift_height=False,
+        translation_std=[
+            0.1,
+            0.1,
+            0.1,
+        ],
+        type='GlobalRotScaleTrans'),
+    dict(
+        keys=[
+            'points',
+            'gt_bboxes_3d',
+            'gt_labels_3d',
+        ],
+        type='Pack3DDetInputs'),
+]
+val_cfg = dict(type='ValLoop')
+val_dataloader = dict(
+    batch_size=24,
+    dataset=dict(
+        ann_file='embodiedscan_infos_val.pkl',
+        box_type_3d='Euler-Depth',
+        data_root='data',
+        filter_empty_gt=True,
+        metainfo=dict(classes='all'),
+        pipeline=[
+            dict(type='LoadAnnotations3D'),
+            dict(type='DefaultPipeline'),
+            dict(num_points=100000, type='PointSample'),
+            dict(
+                keys=[
+                    'points',
+                    'gt_bboxes_3d',
+                    'gt_labels_3d',
+                ],
+                type='Pack3DDetInputs'),
+        ],
+        test_mode=True,
+        tokens_positive_rebuild=True,
+        type='MMScanPointCloud3DGroundingDataset',
+        vg_file=''),
+    drop_last=False,
+    num_workers=12,
+    persistent_workers=True,
+    sampler=dict(shuffle=False, type='DefaultSampler'))
+val_evaluator = dict(type='GroundingMetricMod')
+work_dir = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-mmscan-grounding-20Per-100queries-load'
diff --git a/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num256.py b/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num256.py
new file mode 100644
index 0000000..ebcb458
--- /dev/null
+++ b/models/EmbodiedScan/configs/grounding/pcd_4xb24_mmscan_vg_num256.py
@@ -0,0 +1,273 @@
+
+# Edit this: path to the pretrained multi-view 3D detection checkpoint
+# (mv-3ddet.pth, see models/README.md).
+load_from = '/path/to/mv-3ddet.pth'
+backend_args = None
+custom_hooks = [
+    dict(after_iter=True, type='EmptyCacheHook'),
+]
+data_root = 'data'
+dataset_type = 'PointCloud3DGroundingDataset'
+default_hooks = dict(
+    checkpoint=dict(interval=1, max_keep_ckpts=3, type='CheckpointHook'),
+    logger=dict(interval=50, type='LoggerHook'),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    timer=dict(type='IterTimerHook'))
+default_scope = 'embodiedscan'
+env_cfg = dict(
+    cudnn_benchmark=False,
+    dist_cfg=dict(backend='nccl', port=22873),
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
+find_unused_parameters = True
+launcher = 'slurm'
+
+log_level = 'INFO'
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50)
+lr = 0.0005
+metainfo = dict(classes='all')
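+# Model: a sparse-conv 3D grounder. A MinkResNet-34 consumes 6-channel points
+# (xyz + rgb, given use_xyz_feat=True), a MinkNeck projects features to 256-d,
+# and a 6-layer transformer decoder cross-attends the object queries to both
+# point and text features (num_classes=256 matches the head's max_text_len).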
+model = dict(
+    backbone_3d=dict(depth=34, in_channels=6, type='MinkResNet'),
+    bbox_head=dict(
+        contrastive_cfg=dict(bias=True, log_scale='auto', max_text_len=256),
+        decouple_bbox_loss=True,
+        decouple_groups=4,
+        decouple_weights=[
+            0.2,
+            0.2,
+            0.2,
+            0.4,
+        ],
+        loss_bbox=dict(
+            group='g8', loss_weight=1.0, mode='l1', type='BBoxCDLoss'),
+        loss_cls=dict(
+            alpha=0.25,
+            gamma=2.0,
+            loss_weight=1.0,
+            type='mmdet.FocalLoss',
+            use_sigmoid=True),
+        num_classes=256,
+        share_pred_layer=True,
+        sync_cls_avg_factor=True,
+        type='GroundingHead'),
+    coord_type='DEPTH',
+    data_preprocessor=dict(
+        bgr_to_rgb=True,
+        mean=[
+            123.675,
+            116.28,
+            103.53,
+        ],
+        pad_size_divisor=32,
+        std=[
+            58.395,
+            57.12,
+            57.375,
+        ],
+        type='Det3DDataPreprocessor'),
+    decoder=dict(
+        layer_cfg=dict(
+            cross_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8),
+            cross_attn_text_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8),
+            ffn_cfg=dict(
+                embed_dims=256, feedforward_channels=2048, ffn_drop=0.0),
+            self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8)),
+        num_layers=6,
+        post_norm_cfg=None,
+        return_intermediate=True),
+    neck_3d=dict(
+        in_channels=[
+            64,
+            128,
+            256,
+            512,
+        ],
+        num_classes=1,
+        out_channels=256,
+        pts_prune_threshold=1000,
+        type='MinkNeck',
+        voxel_size=0.01),
+    num_queries=256,
+    test_cfg=None,
+    train_cfg=dict(
+        assigner=dict(
+            match_costs=[
+                dict(type='BinaryFocalLossCost', weight=1.0),
+                dict(type='BBox3DL1Cost', weight=2.0),
+                dict(type='IoU3DCost', weight=2.0),
+            ],
+            type='HungarianAssigner3D')),
+    type='SparseFeatureFusion3DGrounderMod',
+    use_xyz_feat=True,
+    voxel_size=0.01)
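+# Each scan is down-sampled to 100k points by 'PointSample' in the pipelines below.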
+n_points = 100000
+optim_wrapper = dict(
+    clip_grad=dict(max_norm=10, norm_type=2),
+    optimizer=dict(lr=0.0005, type='AdamW', weight_decay=0.0005),
+    paramwise_cfg=dict(
+        custom_keys=dict(
+            decoder=dict(decay_mult=1.0, lr_mult=0.1),
+            text_encoder=dict(lr_mult=0.0))),
+    type='OptimWrapper')
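+# Step schedule: the learning rate is multiplied by 0.1 after epochs 8 and 11
+# of the 12-epoch run.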
+param_scheduler = dict(
+    begin=0,
+    by_epoch=True,
+    end=12,
+    gamma=0.1,
+    milestones=[
+        8,
+        11,
+    ],
+    type='MultiStepLR')
+resume = False
+test_cfg = dict(type='TestLoop')
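+# Note: vg_file is left empty below; MMScanPointCloud3DGroundingDataset fetches
+# its grounding annotations through the MMScan API (see
+# embodiedscan/datasets/mmscan_dataset.py), so the field appears unused here.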
+test_dataloader = dict(
+    batch_size=24,
+    dataset=dict(
+        ann_file='embodiedscan_infos_val.pkl',
+        box_type_3d='Euler-Depth',
+        data_root='data',
+        filter_empty_gt=True,
+        metainfo=dict(classes='all'),
+        pipeline=[
+            dict(type='LoadAnnotations3D'),
+            dict(type='DefaultPipeline'),
+            dict(num_points=100000, type='PointSample'),
+            dict(
+                keys=[
+                    'points',
+                    'gt_bboxes_3d',
+                    'gt_labels_3d',
+                ],
+                type='Pack3DDetInputs'),
+        ],
+        test_mode=True,
+        tokens_positive_rebuild=True,
+        type='MMScanPointCloud3DGroundingDataset',
+        vg_file=''),
+    drop_last=False,
+    num_workers=12,
+    persistent_workers=True,
+    sampler=dict(shuffle=False, type='DefaultSampler'))
+test_evaluator = dict(type='GroundingMetricMod')
+test_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(type='DefaultPipeline'),
+    dict(num_points=100000, type='PointSample'),
+    dict(
+        keys=[
+            'points',
+            'gt_bboxes_3d',
+            'gt_labels_3d',
+        ],
+        type='Pack3DDetInputs'),
+]
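+# 12 training epochs with validation every 3; batch size is 24 per GPU
+# (the '4xb24' in the file name suggests a 4-GPU setup).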
+train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=3)
+train_dataloader = dict(
+    batch_size=24,
+    dataset=dict(
+        dataset=dict(
+            ann_file='embodiedscan_infos_train.pkl',
+            box_type_3d='Euler-Depth',
+            data_root='data',
+            filter_empty_gt=True,
+            metainfo=dict(classes='all'),
+            pipeline=[
+                dict(type='LoadAnnotations3D'),
+                dict(type='DefaultPipeline'),
+                dict(num_points=100000, type='PointSample'),
+                dict(
+                    rot_range=[
+                        -0.087266,
+                        0.087266,
+                    ],
+                    scale_ratio_range=[
+                        0.9,
+                        1.1,
+                    ],
+                    shift_height=False,
+                    translation_std=[
+                        0.1,
+                        0.1,
+                        0.1,
+                    ],
+                    type='GlobalRotScaleTrans'),
+                dict(
+                    keys=[
+                        'points',
+                        'gt_bboxes_3d',
+                        'gt_labels_3d',
+                    ],
+                    type='Pack3DDetInputs'),
+            ],
+            test_mode=False,
+            tokens_positive_rebuild=True,
+            type='MMScanPointCloud3DGroundingDataset',
+            vg_file=''),
+        times=1,
+        type='RepeatDataset'),
+    num_workers=12,
+    persistent_workers=True,
+    sampler=dict(shuffle=True, type='DefaultSampler'))
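+# Train-time augmentation: random rotation within +/-0.087266 rad (about 5
+# degrees), random scaling in [0.9, 1.1], and random translation with a
+# per-axis std of 0.1 m.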
+train_pipeline = [
+    dict(type='LoadAnnotations3D'),
+    dict(type='DefaultPipeline'),
+    dict(num_points=100000, type='PointSample'),
+    dict(
+        rot_range=[
+            -0.087266,
+            0.087266,
+        ],
+        scale_ratio_range=[
+            0.9,
+            1.1,
+        ],
+        shift_height=False,
+        translation_std=[
+            0.1,
+            0.1,
+            0.1,
+        ],
+        type='GlobalRotScaleTrans'),
+    dict(
+        keys=[
+            'points',
+            'gt_bboxes_3d',
+            'gt_labels_3d',
+        ],
+        type='Pack3DDetInputs'),
+]
+val_cfg = dict(type='ValLoop')
+val_dataloader = dict(
+    batch_size=24,
+    dataset=dict(
+        ann_file='embodiedscan_infos_val.pkl',
+        box_type_3d='Euler-Depth',
+        data_root='data',
+        filter_empty_gt=True,
+        metainfo=dict(classes='all'),
+        pipeline=[
+            dict(type='LoadAnnotations3D'),
+            dict(type='DefaultPipeline'),
+            dict(num_points=100000, type='PointSample'),
+            dict(
+                keys=[
+                    'points',
+                    'gt_bboxes_3d',
+                    'gt_labels_3d',
+                ],
+                type='Pack3DDetInputs'),
+        ],
+        test_mode=True,
+        tokens_positive_rebuild=True,
+        type='MMScanPointCloud3DGroundingDataset',
+        vg_file=''),
+    drop_last=False,
+    num_workers=12,
+    persistent_workers=True,
+    sampler=dict(shuffle=False, type='DefaultSampler'))
+val_evaluator = dict(type='GroundingMetricMod')
+work_dir = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-mmscan-grounding-20Per-100queries-load'
diff --git a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-100-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-100-5-vg-9dof.py
deleted file mode 100644
index 1b147a5..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-100-5-vg-9dof.py
+++ /dev/null
@@ -1,199 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if use ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounderMod',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    # backbone=dict(
-    #     type='mmdet.ResNet',
-    #     depth=50,
-    #     base_channels=16,  # to make it consistent with mink resnet
-    #     num_stages=4,
-    #     out_indices=(0, 1, 2, 3),
-    #     frozen_stages=1,
-    #     norm_cfg=dict(type='BN', requires_grad=False),
-    #     norm_eval=True,
-    #     init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-    #     style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34),
-    use_xyz_feat=True,
-    # change due to no img feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[64, 128, 256, 512],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'PointCloud3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-# TODO: to determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file=
-        'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json',
-        #    vg_file='embodiedscan_val_mini_vg.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth'  # noqa
diff --git a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-50-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-50-5-vg-9dof.py
deleted file mode 100644
index efabefc..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-50-5-vg-9dof.py
+++ /dev/null
@@ -1,199 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if use ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounderMod',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    # backbone=dict(
-    #     type='mmdet.ResNet',
-    #     depth=50,
-    #     base_channels=16,  # to make it consistent with mink resnet
-    #     num_stages=4,
-    #     out_indices=(0, 1, 2, 3),
-    #     frozen_stages=1,
-    #     norm_cfg=dict(type='BN', requires_grad=False),
-    #     norm_eval=True,
-    #     init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-    #     style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34),
-    use_xyz_feat=True,
-    # change due to no img feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[64, 128, 256, 512],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'PointCloud3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-# TODO: to determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_50Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file=
-        'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json',
-        #    vg_file='embodiedscan_val_mini_vg.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth'  # noqa
diff --git a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-75-5-vg-9dof.py b/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-75-5-vg-9dof.py
deleted file mode 100644
index 703148f..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_8xb12-mmscan-75-5-vg-9dof.py
+++ /dev/null
@@ -1,199 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-
-backend_args = None
-# Uncomment the following if use ceph or other file clients.
-# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
-# for more details.
-# file_client_args = dict(
-#     backend='petrel',
-#     path_mapping=dict({
-#         './data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/',
-#         'data/scannet/':
-#         's3://openmmlab/datasets/detection3d/scannet_processed/'
-#     }))
-
-metainfo = dict(classes='all')
-
-model = dict(
-    type='SparseFeatureFusion3DGrounderMod',
-    num_queries=100,
-    voxel_size=0.01,
-    data_preprocessor=dict(type='Det3DDataPreprocessor',
-                           mean=[123.675, 116.28, 103.53],
-                           std=[58.395, 57.12, 57.375],
-                           bgr_to_rgb=True,
-                           pad_size_divisor=32),
-    # backbone=dict(
-    #     type='mmdet.ResNet',
-    #     depth=50,
-    #     base_channels=16,  # to make it consistent with mink resnet
-    #     num_stages=4,
-    #     out_indices=(0, 1, 2, 3),
-    #     frozen_stages=1,
-    #     norm_cfg=dict(type='BN', requires_grad=False),
-    #     norm_eval=True,
-    #     init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
-    #     style='pytorch'),
-    backbone_3d=dict(type='MinkResNet', in_channels=6, depth=34),
-    use_xyz_feat=True,
-    # change due to no img feature fusion
-    neck_3d=dict(type='MinkNeck',
-                 num_classes=1,
-                 in_channels=[64, 128, 256, 512],
-                 out_channels=256,
-                 voxel_size=0.01,
-                 pts_prune_threshold=1000),
-    decoder=dict(
-        num_layers=6,
-        return_intermediate=True,
-        layer_cfg=dict(
-            # query self attention layer
-            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to text
-            cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            # cross attention layer query to image
-            cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
-            ffn_cfg=dict(embed_dims=256,
-                         feedforward_channels=2048,
-                         ffn_drop=0.0)),
-        post_norm_cfg=None),
-    bbox_head=dict(type='GroundingHead',
-                   num_classes=256,
-                   sync_cls_avg_factor=True,
-                   decouple_bbox_loss=True,
-                   decouple_groups=4,
-                   share_pred_layer=True,
-                   decouple_weights=[0.2, 0.2, 0.2, 0.4],
-                   contrastive_cfg=dict(max_text_len=256,
-                                        log_scale='auto',
-                                        bias=True),
-                   loss_cls=dict(type='mmdet.FocalLoss',
-                                 use_sigmoid=True,
-                                 gamma=2.0,
-                                 alpha=0.25,
-                                 loss_weight=1.0),
-                   loss_bbox=dict(type='BBoxCDLoss',
-                                  mode='l1',
-                                  loss_weight=1.0,
-                                  group='g8')),
-    coord_type='DEPTH',
-    # training and testing settings
-    train_cfg=dict(assigner=dict(type='HungarianAssigner3D',
-                                 match_costs=[
-                                     dict(type='BinaryFocalLossCost',
-                                          weight=1.0),
-                                     dict(type='BBox3DL1Cost', weight=2.0),
-                                     dict(type='IoU3DCost', weight=2.0)
-                                 ]), ),
-    test_cfg=None)
-
-dataset_type = 'PointCloud3DGroundingDataset'
-data_root = 'data'
-
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='GlobalRotScaleTrans',
-         rot_range=[-0.087266, 0.087266],
-         scale_ratio_range=[.9, 1.1],
-         translation_std=[.1, .1, .1],
-         shift_height=False),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='PointCloudPipeline'),
-    dict(type='PointSample', num_points=n_points),
-    dict(type='Pack3DDetInputs',
-         keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
-]
-# TODO: to determine a reasonable batch size
-train_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    sampler=dict(type='DefaultSampler', shuffle=True),
-    dataset=dict(
-        type='RepeatDataset',
-        times=1,
-        dataset=dict(
-            type=dataset_type,
-            data_root=data_root,
-            ann_file='embodiedscan_infos_train.pkl',
-            vg_file=
-            'es_gen_text/vg_full/VG_train_75Percent_flattened_token_positive.json',
-            metainfo=metainfo,
-            pipeline=train_pipeline,
-            test_mode=False,
-            filter_empty_gt=True,
-            box_type_3d='Euler-Depth',
-            tokens_positive_rebuild=True)))
-
-val_dataloader = dict(
-    batch_size=12,
-    num_workers=12,
-    persistent_workers=True,
-    drop_last=False,
-    sampler=dict(type='DefaultSampler', shuffle=False),
-    dataset=dict(
-        type=dataset_type,
-        data_root=data_root,
-        ann_file='embodiedscan_infos_val.pkl',
-        vg_file=
-        'es_gen_text/vg_full/VG_val_5Percent_flattened_token_positive.json',
-        #    vg_file='embodiedscan_val_mini_vg.json',
-        metainfo=metainfo,
-        pipeline=test_pipeline,
-        test_mode=True,
-        filter_empty_gt=True,
-        box_type_3d='Euler-Depth',
-        tokens_positive_rebuild=True))
-test_dataloader = val_dataloader
-
-val_evaluator = dict(type='GroundingMetricMod')
-test_evaluator = val_evaluator
-
-# training schedule for 1x
-train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=3)
-val_cfg = dict(type='ValLoop')
-test_cfg = dict(type='TestLoop')
-
-# optimizer
-lr = 5e-4
-optim_wrapper = dict(type='OptimWrapper',
-                     optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
-                     paramwise_cfg=dict(
-                         custom_keys={
-                             'text_encoder': dict(lr_mult=0.0),
-                             'decoder': dict(lr_mult=0.1, decay_mult=1.0)
-                         }),
-                     clip_grad=dict(max_norm=10, norm_type=2))
-
-# learning rate
-param_scheduler = dict(type='MultiStepLR',
-                       begin=0,
-                       end=12,
-                       by_epoch=True,
-                       milestones=[8, 11],
-                       gamma=0.1)
-
-custom_hooks = [dict(type='EmptyCacheHook', after_iter=True)]
-
-# hooks
-default_hooks = dict(
-    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=3))
-
-# vis_backends = [
-#     dict(type='TensorboardVisBackend'),
-#     dict(type='LocalVisBackend')
-# ]
-# visualizer = dict(
-#     type='Det3DLocalVisualizer',
-#     vis_backends=vis_backends, name='visualizer')
-
-find_unused_parameters = True
-# load_from = '/mnt/petrelfs/lvruiyuan/repos/EmbodiedScan/work_dirs/pcd-esmod-grounding/epoch_12.pth'  # noqa
diff --git a/models/EmbodiedScan/configs/grounding/pcd_vg_mmscan.py b/models/EmbodiedScan/configs/grounding/pcd_vg_mmscan.py
deleted file mode 100644
index 81f328d..0000000
--- a/models/EmbodiedScan/configs/grounding/pcd_vg_mmscan.py
+++ /dev/null
@@ -1,256 +0,0 @@
-_base_ = ['../default_runtime.py']
-n_points = 100000
-backend_args = None
-custom_hooks = [
-    dict(after_iter=True, type='EmptyCacheHook'),
-]
-data_root = 'data'
-dataset_type = 'MMScanPointCloud3DGroundingDataset'
-default_hooks = dict(checkpoint=dict(interval=1,
-                                     max_keep_ckpts=3,
-                                     type='CheckpointHook'),
-                     logger=dict(interval=50, type='LoggerHook'),
-                     param_scheduler=dict(type='ParamSchedulerHook'),
-                     sampler_seed=dict(type='DistSamplerSeedHook'),
-                     timer=dict(type='IterTimerHook'))
-default_scope = 'embodiedscan'
-# env_cfg = dict(
-#     cudnn_benchmark=False,
-#     dist_cfg=dict(backend='nccl', port=25940),
-#     mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
-find_unused_parameters = True
-
-load_from = None
-log_level = 'INFO'
-log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50)
-lr = 0.0005
-metainfo = dict(classes='all')
-model = dict(backbone_3d=dict(depth=34, in_channels=6, type='MinkResNet'),
-             bbox_head=dict(contrastive_cfg=dict(bias=True,
-                                                 log_scale='auto',
-                                                 max_text_len=256),
-                            decouple_bbox_loss=True,
-                            decouple_groups=4,
-                            decouple_weights=[
-                                0.2,
-                                0.2,
-                                0.2,
-                                0.4,
-                            ],
-                            loss_bbox=dict(group='g8',
-                                           loss_weight=1.0,
-                                           mode='l1',
-                                           type='BBoxCDLoss'),
-                            loss_cls=dict(alpha=0.25,
-                                          gamma=2.0,
-                                          loss_weight=1.0,
-                                          type='mmdet.FocalLoss',
-                                          use_sigmoid=True),
-                            num_classes=256,
-                            share_pred_layer=True,
-                            sync_cls_avg_factor=True,
-                            type='GroundingHead'),
-             coord_type='DEPTH',
-             data_preprocessor=dict(bgr_to_rgb=True,
-                                    mean=[
-                                        123.675,
-                                        116.28,
-                                        103.53,
-                                    ],
-                                    pad_size_divisor=32,
-                                    std=[
-                                        58.395,
-                                        57.12,
-                                        57.375,
-                                    ],
-                                    type='Det3DDataPreprocessor'),
-             decoder=dict(layer_cfg=dict(
-                 cross_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8),
-                 cross_attn_text_cfg=dict(dropout=0.0,
-                                          embed_dims=256,
-                                          num_heads=8),
-                 ffn_cfg=dict(embed_dims=256,
-                              feedforward_channels=2048,
-                              ffn_drop=0.0),
-                 self_attn_cfg=dict(dropout=0.0, embed_dims=256, num_heads=8)),
-                          num_layers=6,
-                          post_norm_cfg=None,
-                          return_intermediate=True),
-             neck_3d=dict(in_channels=[
-                 64,
-                 128,
-                 256,
-                 512,
-             ],
-                          num_classes=1,
-                          out_channels=256,
-                          pts_prune_threshold=1000,
-                          type='MinkNeck',
-                          voxel_size=0.01),
-             num_queries=100,
-             test_cfg=None,
-             train_cfg=dict(assigner=dict(match_costs=[
-                 dict(type='BinaryFocalLossCost', weight=1.0),
-                 dict(type='BBox3DL1Cost', weight=2.0),
-                 dict(type='IoU3DCost', weight=2.0),
-             ],
-                                          type='HungarianAssigner3D')),
-             type='SparseFeatureFusion3DGrounderMod',
-             use_xyz_feat=True,
-             voxel_size=0.01)
-n_points = 100000
-optim_wrapper = dict(
-    clip_grad=dict(max_norm=10, norm_type=2),
-    optimizer=dict(lr=0.0005, type='AdamW', weight_decay=0.0005),
-    paramwise_cfg=dict(
-        custom_keys=dict(decoder=dict(decay_mult=1.0, lr_mult=0.1),
-                         text_encoder=dict(lr_mult=0.0))),
-    type='OptimWrapper')
-param_scheduler = dict(begin=0,
-                       by_epoch=True,
-                       end=12,
-                       gamma=0.1,
-                       milestones=[
-                           8,
-                           11,
-                       ],
-                       type='MultiStepLR')
-resume = False
-test_cfg = dict(type='TestLoop')
-test_dataloader = dict(batch_size=24,
-                       dataset=dict(ann_file='',
-                                    box_type_3d='Euler-Depth',
-                                    data_root='data',
-                                    filter_empty_gt=True,
-                                    metainfo=dict(classes='all'),
-                                    pipeline=[
-                                        dict(type='LoadAnnotations3D'),
-                                        dict(type='DefaultPipeline'),
-                                        dict(num_points=100000,
-                                             type='PointSample'),
-                                        dict(keys=[
-                                            'points',
-                                            'gt_bboxes_3d',
-                                            'gt_labels_3d',
-                                        ],
-                                             type='Pack3DDetInputs'),
-                                    ],
-                                    test_mode=True,
-                                    tokens_positive_rebuild=True,
-                                    type='MMScanPointCloud3DGroundingDataset',
-                                    vg_file=''),
-                       drop_last=False,
-                       num_workers=4,
-                       persistent_workers=True,
-                       sampler=dict(shuffle=False, type='DefaultSampler'))
-test_evaluator = dict(type='GroundingMetricMod')
-test_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='DefaultPipeline'),
-    dict(num_points=100000, type='PointSample'),
-    dict(keys=[
-        'points',
-        'gt_bboxes_3d',
-        'gt_labels_3d',
-    ],
-         type='Pack3DDetInputs'),
-]
-
-train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=12)
-train_dataloader = dict(batch_size=24,
-                        dataset=dict(dataset=dict(
-                            ann_file='',
-                            box_type_3d='Euler-Depth',
-                            data_root='data',
-                            filter_empty_gt=True,
-                            metainfo=dict(classes='all'),
-                            pipeline=[
-                                dict(type='LoadAnnotations3D'),
-                                dict(type='DefaultPipeline'),
-                                dict(num_points=100000, type='PointSample'),
-                                dict(rot_range=[
-                                    -0.087266,
-                                    0.087266,
-                                ],
-                                     scale_ratio_range=[
-                                         0.9,
-                                         1.1,
-                                     ],
-                                     shift_height=False,
-                                     translation_std=[
-                                         0.1,
-                                         0.1,
-                                         0.1,
-                                     ],
-                                     type='GlobalRotScaleTrans'),
-                                dict(keys=[
-                                    'points',
-                                    'gt_bboxes_3d',
-                                    'gt_labels_3d',
-                                ],
-                                     type='Pack3DDetInputs'),
-                            ],
-                            test_mode=False,
-                            tokens_positive_rebuild=True,
-                            type='MMScanPointCloud3DGroundingDataset',
-                            vg_file=''),
-                                     times=1,
-                                     type='RepeatDataset'),
-                        num_workers=4,
-                        persistent_workers=True,
-                        sampler=dict(shuffle=True, type='DefaultSampler'))
-train_pipeline = [
-    dict(type='LoadAnnotations3D'),
-    dict(type='DefaultPipeline'),
-    dict(num_points=100000, type='PointSample'),
-    dict(rot_range=[
-        -0.087266,
-        0.087266,
-    ],
-         scale_ratio_range=[
-             0.9,
-             1.1,
-         ],
-         shift_height=False,
-         translation_std=[
-             0.1,
-             0.1,
-             0.1,
-         ],
-         type='GlobalRotScaleTrans'),
-    dict(keys=[
-        'points',
-        'gt_bboxes_3d',
-        'gt_labels_3d',
-    ],
-         type='Pack3DDetInputs'),
-]
-val_cfg = dict(type='ValLoop')
-val_dataloader = dict(batch_size=24,
-                      dataset=dict(ann_file='',
-                                   box_type_3d='Euler-Depth',
-                                   data_root='data',
-                                   filter_empty_gt=True,
-                                   metainfo=dict(classes='all'),
-                                   pipeline=[
-                                       dict(type='LoadAnnotations3D'),
-                                       dict(type='DefaultPipeline'),
-                                       dict(num_points=100000,
-                                            type='PointSample'),
-                                       dict(keys=[
-                                           'points',
-                                           'gt_bboxes_3d',
-                                           'gt_labels_3d',
-                                       ],
-                                            type='Pack3DDetInputs'),
-                                   ],
-                                   test_mode=True,
-                                   tokens_positive_rebuild=True,
-                                   type='MMScanPointCloud3DGroundingDataset',
-                                   vg_file=' '),
-                      drop_last=False,
-                      num_workers=4,
-                      persistent_workers=True,
-                      sampler=dict(shuffle=False, type='DefaultSampler'))
-val_evaluator = dict(type='GroundingMetricMod')
-work_dir = 'exps/MMScan-VG-1030'
diff --git a/models/EmbodiedScan/embodiedscan/datasets/mmscan_dataset.py b/models/EmbodiedScan/embodiedscan/datasets/mmscan_dataset.py
index fc9a438..eee72a2 100644
--- a/models/EmbodiedScan/embodiedscan/datasets/mmscan_dataset.py
+++ b/models/EmbodiedScan/embodiedscan/datasets/mmscan_dataset.py
@@ -149,7 +149,7 @@ def __init__(self,
         self.mmscan_loader = MMScan(version='v1',
                                     split='val' if test_mode else 'train',
                                     task='MMScan-VG',
-                                    ratio=0.1)
+                                    ratio=0.2)
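+        # ratio=0.2 presumably loads 20% of the MMScan-VG samples (the
+        # parameter name suggests a sampling fraction).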
 
         if 'classes' in metainfo:
             if metainfo['classes'] == 'all':
diff --git a/models/README.md b/models/README.md
index 109231b..5309e7b 100644
--- a/models/README.md
+++ b/models/README.md
@@ -1,4 +1,6 @@
-## Visual Grounding Models
+## 3D Visual Grounding Models
+
+These are 3D visual grounding models adapted for the mmscan-devkit. Currently, two models have been released: EmbodiedScan and ScanRefer.
 
 ### Scanrefer
 
@@ -22,31 +24,33 @@
 
 ### EmbodiedScan
 
-1. Follow the [EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan/blob/main/README.md) to setup the Env. You need not load the datasets!
+1. Follow the [EmbodiedScan](https://github.com/OpenRobotLab/EmbodiedScan/blob/main/README.md) guide to set up the environment. Download the [Multi-View 3D Detection model's weights](https://download.openmmlab.com/mim-example/embodiedscan/mv-3ddet.pth) and set `load_from` in the config file under `configs/grounding` to the path where the weights are saved.
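+
+   For example, in `configs/grounding/pcd_4xb24_mmscan_vg_num256.py` the line to edit is:
+
+   ```python
+   load_from = '/path/to/mv-3ddet.pth'  # replace with the actual checkpoint path
+   ```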
 
 2. Install MMScan API.
 
-3. Run the following command to train Scanrefer (multiple GPU):
+3. Run one of the following commands to train EmbodiedScan (single or multiple GPUs):
 
    ```bash
    # Single GPU training
-   python tools/train.py configs/grounding/pcd_vg_mmscan.py --work-dir=path/to/save
+   python tools/train.py configs/grounding/pcd_4xb24_mmscan_vg_num256.py --work-dir=path/to/save
 
    # Multiple GPU training
-   python tools/train.py configs/grounding/pcd_vg_mmscan.py --work-dir=path/to/save --launcher="pytorch"
+   python tools/train.py configs/grounding/pcd_4xb24_mmscan_vg_num256.py --work-dir=path/to/save --launcher="pytorch"
    ```
 
-4. Run the following command to evaluate Scanrefer (multiple GPU):
+4. Run one of the following commands to evaluate EmbodiedScan (single or multiple GPUs):
 
    ```bash
    # Single GPU testing
-   python tools/test.py configs/grounding/pcd_vg_mmscan.py path/to/load_pth
+   python tools/test.py configs/grounding/pcd_4xb24_mmscan_vg_num256.py path/to/load_pth
 
    # Multiple GPU testing
-   python tools/test.py configs/grounding/pcd_vg_mmscan.py path/to/load_pth --launcher="pytorch"
+   python tools/test.py configs/grounding/pcd_4xb24_mmscan_vg_num256.py path/to/load_pth --launcher="pytorch"
    ```
 
-## Question Answering Models
+## 3D Question Answering Models
+
+These are 3D question answering models adapted for the mmscan-devkit. Currently, two models have been released: LL3DA and LEO.
 
 ### LL3DA
 
@@ -79,7 +83,6 @@
    python eval_utils/evaluate_gpt.py --file path/to/qa_pred_gt_val.json
    --tmp_path path/to/tmp  --api_key your_api_key --eval_size -1
    --nproc 4
-
    ```
 
 ### LEO
@@ -113,7 +116,6 @@
    python evaluator/GPT_eval.py --file path/to/test_embodied_scan_l_complete.json
    --tmp_path path/to/tmp  --api_key your_api_key --eval_size -1
    --nproc 4
-
    ```
 
 PS: When training for more epochs, LEO may encounter a "NaN" error in the MultiHeadAttentionSpatial module due to the training setup (no such problem when training for one epoch on 4 GPUs).
diff --git a/models/Scanrefer/scripts/train.py b/models/Scanrefer/scripts/train.py
index 6dde77f..0443bf4 100644
--- a/models/Scanrefer/scripts/train.py
+++ b/models/Scanrefer/scripts/train.py
@@ -240,7 +240,7 @@ def train(args):
     # }
 
     # dataloader
-    train_dataset, train_dataloader = get_dataloader(args, 'train', DC, True)
+    train_dataset, train_dataloader = get_dataloader(args, 'train', DC, False)
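+    # The final flag is now False for the train split as well (likely the
+    # augmentation switch, judging by upstream ScanRefer's get_dataloader).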
     val_dataset, val_dataloader = get_dataloader(args, 'val', DC, False)
     dataloader = {'train': train_dataloader, 'val': val_dataloader}