Fix time calculation: start the timer before dataset loading

modelscope · pan-x-c · Sep 4, 2024 · Sep 5, 2024 · Sep 5, 2024 · Sep 5, 2024
commit e85412715eecdaecf552e84c909145db90189284
diff --git a/data_juicer/core/ray_data.py b/data_juicer/core/ray_data.py
@@ -122,7 +122,7 @@ def split_jsonl(file_path: str, max_size: int,
 
             # Write the buffered lines to the current output file
             if buffer:
-                with open(output_file_path, 'a', encoding='utf-8') as outfile:
+                with open(output_file_path, 'w', encoding='utf-8') as outfile:
                     outfile.writelines(buffer)
                 buffer = []
                 buffer_size = 0
@@ -197,7 +197,7 @@ def best_file_num(cpu: int, memory: int, file_size: int) -> int:
     Returns:
         int: best number of files in a single batch
     """
-    max_files_by_memory = memory // (2 * file_size)
+    max_files_by_memory = memory // (16 * file_size)
 
     best_num_files = max(1, (max_files_by_memory // cpu)) * cpu
     logger.info(f'Best number of files in a single batch: {best_num_files}')
@@ -250,6 +250,7 @@ def read_jsonl(cls,
                                     DEFAULT_MAX_FILE_SIZE, cfg.work_dir)
         cpu = ray.cluster_resources().get('CPU', 0)
         memory = ray.cluster_resources().get('memory', 0) / 1024 / 1024
+        logger.info(f'CPU: {cpu}, Memory: {memory}')
         batch_file_num = best_file_num(cpu, memory, DEFAULT_MAX_FILE_SIZE)
         return RayDataset(datasets=load_splited_json_dataset(
             files, batch_file_num),

diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py
@@ -46,7 +46,7 @@ def run(self, load_data_np=None):
         """
         # 1. load data
         logger.info('Loading dataset with Ray...')
-
+        tstart = time.time()
         if self.cfg.get('generated_dataset_config', None):
             generated_dataset_config = self.cfg.generated_dataset_config
             assert isinstance(generated_dataset_config,
@@ -63,7 +63,6 @@ def run(self, load_data_np=None):
 
         # 3. data process
         logger.info('Processing data...')
-        tstart = time.time()
         dataset.process(ops)
 
         # 4. data export