modelscope · HYLcool · Jan 22, 2025 · Dec 24, 2024 · Dec 24, 2024 · Dec 25, 2024
diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml
@@ -21,7 +21,7 @@ services:
     ports:
       - "6379:6379"
       - "8265:8265"
-    shm_size: "64G"
+    shm_size: "128G"
     deploy:
       resources:
         reservations:
@@ -49,7 +49,7 @@ services:
       - ray-head
     networks:
       - ray-network
-    shm_size: "64G"
+    shm_size: "128G"
     deploy:
       resources:
         reservations:

diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
@@ -44,7 +44,7 @@ jobs:
     - name: Run unittest standalone
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       run: |
-        docker compose exec ray-head python tests/run.py --tag standalone
+        docker compose exec -e OPENAI_BASE_URL="${{ secrets.OPENAI_BASE_URL }}" -e OPENAI_API_KEY="${{ secrets.OPENAI_API_KEY }}" ray-head python tests/run.py --tag standalone
 
     - name: Run unittest ray
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
diff --git a/data_juicer/core/adapter.py b/data_juicer/core/adapter.py
@@ -21,7 +21,8 @@ def __init__(self, cfg: dict):
         self.cfg = cfg
 
         # insight mining related
-        self.enable_insight_mining = self.cfg.open_insight_mining
+        self.enable_insight_mining = self.cfg.open_insight_mining if hasattr(
+            self.cfg, 'open_insight_mining') else False
 
         # resource probe related
         self.idle_resources = Monitor.monitor_current_resources()

diff --git a/data_juicer/core/analyzer.py b/data_juicer/core/analyzer.py
@@ -114,7 +114,7 @@ def run(self,
                         f'[{self.cfg.fusion_strategy}]...')
             ops = fuse_operators(ops, probe_res)
 
-        # 2. stats precompute only for filter ops
+        # 2. stats precompute only for filter or tagging ops
         logger.info('Computing the stats of dataset...')
         stats_collected = False
         for op in ops:
@@ -147,7 +147,7 @@ def run(self,
             compress(dataset)
 
         # 4. analysis and output result to the export path
-        # 4.1. Only consider fields in Fields.stats
+        # 4.1. Only consider fields in Fields.stats and Fields.meta
         # 4.2. For string fields, only consider its histogram
         # 4.3. For numeric fields, consider its histogram and box
         # 4.4. Otherwise, DO NOT analyze

diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
@@ -108,7 +108,7 @@ def wrapper(sample, *args, **kwargs):
                 else:
                     return [res]
             except Exception as e:
-                if skip_op_error:
+                if not skip_op_error:
                     raise
                 from loguru import logger
                 logger.error(f'An error occurred in {op_name} when processing '

diff --git a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py
@@ -118,6 +118,9 @@ def __init__(self,
         model_params = model_params or {}
         sampling_params = sampling_params or {}
 
+        sampling_params = update_sampling_params(sampling_params, hf_model,
+                                                 self.enable_vllm)
+
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
             # cannot initialize vllm replicas on different GPUs
@@ -140,10 +143,6 @@ def __init__(self,
                 **model_params)
             self.sampling_params = sampling_params
 
-        self.sampling_params = update_sampling_params(sampling_params,
-                                                      hf_model,
-                                                      self.enable_vllm)
-
         self.seed_qa_samples = self._load_seed_qa_samples()
         if len(self.seed_qa_samples) == 0:
             raise ValueError('No QA data was parsed from the seed file!')

diff --git a/data_juicer/ops/mapper/generate_qa_from_text_mapper.py b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py
@@ -85,6 +85,9 @@ def __init__(self,
         model_params = model_params or {}
         sampling_params = sampling_params or {}
 
+        sampling_params = update_sampling_params(sampling_params, hf_model,
+                                                 self.enable_vllm)
+
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
             # cannot initialize vllm replicas on different GPUs
@@ -107,10 +110,6 @@ def __init__(self,
                 **model_params)
             self.sampling_params = sampling_params
 
-        self.sampling_params = update_sampling_params(sampling_params,
-                                                      hf_model,
-                                                      self.enable_vllm)
-
     def parse_output(self, raw_output):
         logger.debug(raw_output)
         qa_list = []

diff --git a/data_juicer/ops/mapper/optimize_qa_mapper.py b/data_juicer/ops/mapper/optimize_qa_mapper.py
@@ -77,6 +77,9 @@ def __init__(self,
         model_params = model_params or {}
         sampling_params = sampling_params or {}
 
+        sampling_params = update_sampling_params(sampling_params, hf_model,
+                                                 self.enable_vllm)
+
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
             # cannot initialize vllm replicas on different GPUs
@@ -99,10 +102,6 @@ def __init__(self,
                 **model_params)
             self.sampling_params = sampling_params
 
-        self.sampling_params = update_sampling_params(sampling_params,
-                                                      hf_model,
-                                                      self.enable_vllm)
-
     def build_input(self, sample):
         qa_pair = self.qa_pair_template.format(sample[self.query_key],
                                                sample[self.response_key])

diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py
@@ -68,6 +68,12 @@
     'FastSAM-x.pt',
 }
 
+TORCH_DTYPE_MAPPING = {  # torch_dtype alias string -> torch dtype object
+    'fp32': torch.float32,
+    'fp16': torch.float16,
+    'bf16': torch.bfloat16,
+}
+
 
 def get_backup_model_link(model_name):
     for pattern, url in BACKUP_MODEL_LINKS.items():
@@ -291,8 +297,12 @@ def prepare_diffusion_model(pretrained_model_name_or_path, diffusion_type,
     """
     AUTOINSTALL.check(['torch', 'transformers'])
 
-    if 'device' in model_params:
-        model_params['device_map'] = model_params.pop('device')
+    device = model_params.pop('device', None)
+    if not device:
+        model_params['device_map'] = 'balanced'
+    if 'torch_dtype' in model_params:
+        model_params['torch_dtype'] = TORCH_DTYPE_MAPPING[
+            model_params['torch_dtype']]
 
     diffusion_type_to_pipeline = {
         'image2image': diffusers.AutoPipelineForImage2Image,
@@ -309,6 +319,8 @@ def prepare_diffusion_model(pretrained_model_name_or_path, diffusion_type,
     pipeline = diffusion_type_to_pipeline[diffusion_type]
     model = pipeline.from_pretrained(pretrained_model_name_or_path,
                                      **model_params)
+    if device:
+        model = model.to(device)
 
     return model
 

diff --git a/environments/science_requires.txt b/environments/science_requires.txt
@@ -24,7 +24,7 @@ simple-aesthetics-predictor
 scenedetect[opencv]
 ffmpeg-python
 opencv-python
-vllm>=0.1.3
+vllm>=0.6.5
 rouge
 dashscope
 openai

diff --git a/tests/ops/aggregator/test_entity_attribute_aggregator.py b/tests/ops/aggregator/test_entity_attribute_aggregator.py
@@ -4,11 +4,10 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.aggregator import EntityAttributeAggregator
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, BatchMetaKeys, MetaKeys
 
 
-@SKIPPED_TESTS.register_module()
 class EntityAttributeAggregatorTest(DataJuicerTestCaseBase):
 
     def _run_helper(self, op, samples, output_key=BatchMetaKeys.entity_attribute):
@@ -24,7 +23,7 @@ def _run_helper(self, op, samples, output_key=BatchMetaKeys.entity_attribute):
             for k in data:
                 logger.info(f"{k}: {data[k]}")
             self.assertIn(output_key, data[Fields.batch_meta])
-            self.assertNotEqual(data[Fields.batch_met][output_key], '')
+            self.assertNotEqual(data[Fields.batch_meta][output_key], '')
 
         self.assertEqual(len(new_dataset), len(samples))
 

diff --git a/tests/ops/aggregator/test_meta_tags_aggregator.py b/tests/ops/aggregator/test_meta_tags_aggregator.py
@@ -5,10 +5,8 @@
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.aggregator import MetaTagsAggregator
 from data_juicer.utils.constant import Fields, MetaKeys
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
-
-@SKIPPED_TESTS.register_module()
 class MetaTagsAggregatorTest(DataJuicerTestCaseBase):
 
     def _run_helper(self, op, samples):

diff --git a/tests/ops/aggregator/test_most_relavant_entities_aggregator.py b/tests/ops/aggregator/test_most_relavant_entities_aggregator.py
@@ -4,12 +4,10 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.aggregator import MostRelavantEntitiesAggregator
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 from data_juicer.utils.constant import Fields, BatchMetaKeys, MetaKeys
 
-
-@SKIPPED_TESTS.register_module()
 class MostRelavantEntitiesAggregatorTest(DataJuicerTestCaseBase):
 
     def _run_helper(self, op, samples, output_key=BatchMetaKeys.most_relavant_entities):

diff --git a/tests/ops/aggregator/test_nested_aggregator.py b/tests/ops/aggregator/test_nested_aggregator.py
@@ -4,12 +4,10 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.aggregator import NestedAggregator
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 from data_juicer.utils.constant import Fields, MetaKeys
 
-
-@SKIPPED_TESTS.register_module()
 class NestedAggregatorTest(DataJuicerTestCaseBase):
 
     def _run_helper(self, op, samples, output_key=MetaKeys.event_description):

diff --git a/tests/ops/filter/test_video_motion_score_raft_filter.py b/tests/ops/filter/test_video_motion_score_raft_filter.py
@@ -6,11 +6,7 @@
 from data_juicer.ops.filter.video_motion_score_raft_filter import \
     VideoMotionScoreRaftFilter
 from data_juicer.utils.constant import Fields
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
-
-# skip due to conflicts when run lazy_load in multiprocessing in librosa
-# tests passed locally.
-@SKIPPED_TESTS.register_module()
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 class VideoMotionScoreRaftFilterTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
@@ -133,7 +129,7 @@ def test_middle(self):
             'videos': [self.vid3_path]
         }]
         tgt_list = [{'videos': [self.vid2_path]}]
-        op = VideoMotionScoreRaftFilter(min_score=3, max_score=10.5)
+        op = VideoMotionScoreRaftFilter(min_score=3, max_score=10.2)
         self._run_helper(op, ds_list, tgt_list)
 
     def test_any(self):
@@ -150,7 +146,7 @@ def test_any(self):
             'videos': [self.vid2_path, self.vid3_path]
         }]
         op = VideoMotionScoreRaftFilter(min_score=3,
-                                    max_score=10.5,
+                                    max_score=10.2,
                                     any_or_all='any')
         self._run_helper(op, ds_list, tgt_list)
 
@@ -164,7 +160,7 @@ def test_all(self):
         }]
         tgt_list = []
         op = VideoMotionScoreRaftFilter(min_score=3,
-                                    max_score=10.5,
+                                    max_score=10.2,
                                     any_or_all='all')
         self._run_helper(op, ds_list, tgt_list)
 
@@ -180,7 +176,7 @@ def test_parallel(self):
             'videos': [self.vid3_path]
         }]
         tgt_list = [{'videos': [self.vid2_path]}]
-        op = VideoMotionScoreRaftFilter(min_score=3, max_score=10.5)
+        op = VideoMotionScoreRaftFilter(min_score=3, max_score=10.2)
         self._run_helper(op, ds_list, tgt_list, np=2)
 
 

diff --git a/tests/ops/mapper/test_calibrate_qa_mapper.py b/tests/ops/mapper/test_calibrate_qa_mapper.py
@@ -1,18 +1,10 @@
-import os
 import unittest
-from unittest.mock import Mock, patch
-
-import httpx
 from loguru import logger
 
 from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# Skip tests for this OP because the API call is not configured yet.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class CalibrateQAMapperTest(DataJuicerTestCaseBase):
 
     def _run_op(self, op):
@@ -81,23 +73,5 @@ def test_args(self):
             response_path='choices.0.message.content')
         self._run_op(op)
 
-    @patch('httpx.Client.send')
-    def test_retry(self, mock_send):
-        mock_response = Mock()
-        mock_response.status_code = 408
-        mock_response.headers = {}
-        mock_request = Mock()
-        mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
-            '408 Client Error: Request Timeout',
-            request=mock_request,
-            response=mock_response)
-        mock_send.return_value = mock_response
-
-        with self.assertLogs(level='DEBUG') as cm:
-            op = CalibrateQAMapper(api_model='test',
-                                   model_params={'max_retries': 3})
-            op.process({'text': '', 'query': '', 'response': ''})
-        self.assertIn('3 retries left', '\n'.join(cm.output))
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/ops/mapper/test_calibrate_query_mapper.py b/tests/ops/mapper/test_calibrate_query_mapper.py
@@ -3,13 +3,8 @@
 from loguru import logger
 
 from data_juicer.ops.mapper.calibrate_query_mapper import CalibrateQueryMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
-
-# Skip tests for this OP because the API call is not configured yet.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class CalibrateQueryMapperTest(DataJuicerTestCaseBase):
 
     def _run_op(self, api_model, response_path=None):

diff --git a/tests/ops/mapper/test_calibrate_response_mapper.py b/tests/ops/mapper/test_calibrate_response_mapper.py
@@ -4,13 +4,9 @@
 
 from data_juicer.ops.mapper.calibrate_response_mapper import \
     CalibrateResponseMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# Skip tests for this OP because the API call is not configured yet.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class CalibrateResponseMapperTest(DataJuicerTestCaseBase):
 
     def _run_op(self, api_model, response_path=None):

diff --git a/tests/ops/mapper/test_dialog_intent_detection_mapper.py b/tests/ops/mapper/test_dialog_intent_detection_mapper.py
@@ -4,13 +4,9 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.dialog_intent_detection_mapper import DialogIntentDetectionMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class TestDialogIntentDetectionMapper(DataJuicerTestCaseBase):
     # before runing this test, set below environment variables:
     # export OPENAI_API_URL=https://dashscope.aliyuncs.com/compatible-mode/v1

diff --git a/tests/ops/mapper/test_dialog_sentiment_detection_mapper.py b/tests/ops/mapper/test_dialog_sentiment_detection_mapper.py
@@ -5,13 +5,9 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.dialog_sentiment_detection_mapper import DialogSentimentDetectionMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class TestDialogSentimentDetectionMapper(DataJuicerTestCaseBase):
     # before runing this test, set below environment variables:
     # export OPENAI_API_URL=https://dashscope.aliyuncs.com/compatible-mode/v1