From 20b1ffe9c692014e3bfba2b9a655a010eefa3d73 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Tue, 24 Dec 2024 17:01:41 +0800
Subject: [PATCH 01/26] * fix missing attribute error and duplicate test funcs

---
 data_juicer/core/adapter.py                | 3 ++-
 tests/ops/mapper/test_text_chunk_mapper.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/data_juicer/core/adapter.py b/data_juicer/core/adapter.py
index 64fd622f0..4b74aec1c 100644
--- a/data_juicer/core/adapter.py
+++ b/data_juicer/core/adapter.py
@@ -21,7 +21,8 @@ def __init__(self, cfg: dict):
         self.cfg = cfg
 
         # insight mining related
-        self.enable_insight_mining = self.cfg.open_insight_mining
+        self.enable_insight_mining = self.cfg.open_insight_mining if hasattr(
+            self.cfg, 'open_insight_mining') else False
 
         # resource probe related
         self.idle_resources = Monitor.monitor_current_resources()
diff --git a/tests/ops/mapper/test_text_chunk_mapper.py b/tests/ops/mapper/test_text_chunk_mapper.py
index 0c0a70db3..a412104c4 100644
--- a/tests/ops/mapper/test_text_chunk_mapper.py
+++ b/tests/ops/mapper/test_text_chunk_mapper.py
@@ -101,7 +101,7 @@ def test_max_len_text_chunk(self):
         op = TextChunkMapper(max_len=20, split_pattern=None)
         self._run_helper(op, source, target)
     
-    def test_max_len_text_chunk(self):
+    def test_max_len_text_chunk_overlap_len(self):
         source = [
             {
                 'text': "Today is Sunday and it's a happy day!"

From c57ca129de87f2781090bce3ea49c59c03f471e6 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Tue, 24 Dec 2024 17:46:34 +0800
Subject: [PATCH 02/26] * fix unexpected keyword argument error

---
 data_juicer/utils/model_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py
index 94b4440eb..6e32434fa 100644
--- a/data_juicer/utils/model_utils.py
+++ b/data_juicer/utils/model_utils.py
@@ -613,6 +613,7 @@ def forward(
             output_attentions: Optional[bool] = None,
             output_hidden_states: Optional[bool] = None,
             return_dict: Optional[bool] = None,
+            interpolate_pos_encoding: bool = False,
         ) -> Union[tuple,
                    transformers.modeling_outputs.BaseModelOutputWithPooling]:
             """Flatten `pixel_values` along the batch and time dimension,
@@ -654,6 +655,7 @@ def forward(
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
                 return_dict=True,
+                interpolate_pos_encoding=interpolate_pos_encoding,
             )
 
             # now restore the original dimensions

From 38bfd06e821b6dcfb466196eeb7e42bf24951862 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Wed, 25 Dec 2024 11:11:45 +0800
Subject: [PATCH 03/26] * review for OOM tests

---
 tests/ops/mapper/test_image_captioning_mapper.py                | 2 +-
 tests/ops/mapper/test_image_diffusion_mapper.py                 | 2 +-
 tests/ops/mapper/test_image_tagging_mapper.py                   | 2 +-
 tests/ops/mapper/test_video_captioning_from_audio_mapper.py     | 2 +-
 tests/ops/mapper/test_video_captioning_from_frames_mapper.py    | 2 +-
 .../ops/mapper/test_video_captioning_from_summarizer_mapper.py  | 2 +-
 tests/ops/mapper/test_video_captioning_from_video_mapper.py     | 2 +-
 tests/ops/mapper/test_video_tagging_from_frames_mapper.py       | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/ops/mapper/test_image_captioning_mapper.py b/tests/ops/mapper/test_image_captioning_mapper.py
index 2a772ab20..a3c36ffd8 100644
--- a/tests/ops/mapper/test_image_captioning_mapper.py
+++ b/tests/ops/mapper/test_image_captioning_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class ImageCaptioningMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_image_diffusion_mapper.py b/tests/ops/mapper/test_image_diffusion_mapper.py
index 5883a7ff7..007f3b78b 100644
--- a/tests/ops/mapper/test_image_diffusion_mapper.py
+++ b/tests/ops/mapper/test_image_diffusion_mapper.py
@@ -12,7 +12,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class ImageDiffusionMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_image_tagging_mapper.py b/tests/ops/mapper/test_image_tagging_mapper.py
index d2bbddec2..5eaeea346 100644
--- a/tests/ops/mapper/test_image_tagging_mapper.py
+++ b/tests/ops/mapper/test_image_tagging_mapper.py
@@ -10,7 +10,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class ImageTaggingMapperTest(DataJuicerTestCaseBase):
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                              'data')
diff --git a/tests/ops/mapper/test_video_captioning_from_audio_mapper.py b/tests/ops/mapper/test_video_captioning_from_audio_mapper.py
index 402509639..a77aa64fa 100644
--- a/tests/ops/mapper/test_video_captioning_from_audio_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_audio_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class VideoCaptioningFromAudioMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
index d9bf29724..2d625c917 100644
--- a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class VideoCaptioningFromFramesMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py b/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py
index 016a4d73b..048bb1900 100644
--- a/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class VideoCaptioningFromSummarizerMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_captioning_from_video_mapper.py b/tests/ops/mapper/test_video_captioning_from_video_mapper.py
index f3de27226..a5db1b6db 100644
--- a/tests/ops/mapper/test_video_captioning_from_video_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_video_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class VideoCaptioningFromVideoMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_tagging_from_frames_mapper.py b/tests/ops/mapper/test_video_tagging_from_frames_mapper.py
index 31fc04c3b..fd2f4e16b 100644
--- a/tests/ops/mapper/test_video_tagging_from_frames_mapper.py
+++ b/tests/ops/mapper/test_video_tagging_from_frames_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class VideoTaggingFromFramesMapperTest(DataJuicerTestCaseBase):
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                              'data')

From 8fb34b37eee805fc2c005fcb3a337460a37b1d86 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Wed, 25 Dec 2024 14:12:30 +0800
Subject: [PATCH 04/26] * review for OOM tests

---
 tests/ops/mapper/test_image_captioning_mapper.py            | 2 +-
 tests/ops/mapper/test_image_diffusion_mapper.py             | 2 +-
 tests/ops/mapper/test_image_tagging_mapper.py               | 5 +----
 tests/ops/mapper/test_video_captioning_from_audio_mapper.py | 6 +-----
 .../ops/mapper/test_video_captioning_from_frames_mapper.py  | 2 +-
 .../mapper/test_video_captioning_from_summarizer_mapper.py  | 2 +-
 tests/ops/mapper/test_video_captioning_from_video_mapper.py | 2 +-
 tests/ops/mapper/test_video_tagging_from_frames_mapper.py   | 5 +----
 8 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/tests/ops/mapper/test_image_captioning_mapper.py b/tests/ops/mapper/test_image_captioning_mapper.py
index a3c36ffd8..2a772ab20 100644
--- a/tests/ops/mapper/test_image_captioning_mapper.py
+++ b/tests/ops/mapper/test_image_captioning_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
+@SKIPPED_TESTS.register_module()
 class ImageCaptioningMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_image_diffusion_mapper.py b/tests/ops/mapper/test_image_diffusion_mapper.py
index 007f3b78b..5883a7ff7 100644
--- a/tests/ops/mapper/test_image_diffusion_mapper.py
+++ b/tests/ops/mapper/test_image_diffusion_mapper.py
@@ -12,7 +12,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
+@SKIPPED_TESTS.register_module()
 class ImageDiffusionMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_image_tagging_mapper.py b/tests/ops/mapper/test_image_tagging_mapper.py
index 5eaeea346..20b65a8b3 100644
--- a/tests/ops/mapper/test_image_tagging_mapper.py
+++ b/tests/ops/mapper/test_image_tagging_mapper.py
@@ -6,11 +6,8 @@
 from data_juicer.ops.mapper.image_tagging_mapper import \
     ImageTaggingMapper
 from data_juicer.utils.constant import Fields
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
-# Skip tests for this OP in the GitHub actions due to OOM on the current runner
-# These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
 class ImageTaggingMapperTest(DataJuicerTestCaseBase):
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                              'data')
diff --git a/tests/ops/mapper/test_video_captioning_from_audio_mapper.py b/tests/ops/mapper/test_video_captioning_from_audio_mapper.py
index a77aa64fa..966ae9761 100644
--- a/tests/ops/mapper/test_video_captioning_from_audio_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_audio_mapper.py
@@ -5,13 +5,9 @@
 from data_juicer.ops.mapper.video_captioning_from_audio_mapper import \
     VideoCaptioningFromAudioMapper
 from data_juicer.utils.mm_utils import SpecialTokens
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# Skip tests for this OP in the GitHub actions due to OOM on the current runner
-# These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
 class VideoCaptioningFromAudioMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
index 2d625c917..d9bf29724 100644
--- a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
+@SKIPPED_TESTS.register_module()
 class VideoCaptioningFromFramesMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py b/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py
index 048bb1900..016a4d73b 100644
--- a/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
+@SKIPPED_TESTS.register_module()
 class VideoCaptioningFromSummarizerMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_captioning_from_video_mapper.py b/tests/ops/mapper/test_video_captioning_from_video_mapper.py
index a5db1b6db..f3de27226 100644
--- a/tests/ops/mapper/test_video_captioning_from_video_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_video_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
+@SKIPPED_TESTS.register_module()
 class VideoCaptioningFromVideoMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_tagging_from_frames_mapper.py b/tests/ops/mapper/test_video_tagging_from_frames_mapper.py
index fd2f4e16b..ea3c10b98 100644
--- a/tests/ops/mapper/test_video_tagging_from_frames_mapper.py
+++ b/tests/ops/mapper/test_video_tagging_from_frames_mapper.py
@@ -7,11 +7,8 @@
     VideoTaggingFromFramesMapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.mm_utils import SpecialTokens
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
-# Skip tests for this OP in the GitHub actions due to OOM on the current runner
-# These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
 class VideoTaggingFromFramesMapperTest(DataJuicerTestCaseBase):
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                              'data')

From f63a80e737ea815baea4f801634e570f74c4fac4 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Wed, 25 Dec 2024 14:16:44 +0800
Subject: [PATCH 05/26] - use num_proc=2 instead of num_proc=4 for unittest

---
 tests/ops/mapper/test_image_captioning_mapper.py            | 6 +++---
 .../ops/mapper/test_video_captioning_from_frames_mapper.py  | 6 +++---
 tests/ops/mapper/test_video_captioning_from_video_mapper.py | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/ops/mapper/test_image_captioning_mapper.py b/tests/ops/mapper/test_image_captioning_mapper.py
index 2a772ab20..0c6c06f99 100644
--- a/tests/ops/mapper/test_image_captioning_mapper.py
+++ b/tests/ops/mapper/test_image_captioning_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class ImageCaptioningMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
@@ -134,7 +134,7 @@ def test_multi_process(self):
         op = ImageCaptioningMapper(hf_img2seq=self.hf_img2seq,
                                    caption_num=caption_num,
                                    keep_candidate_mode='random_any')
-        self._run_mapper(dataset, op, num_proc=4, caption_num=len(dataset) * 2)
+        self._run_mapper(dataset, op, num_proc=2, caption_num=len(dataset) * 2)
 
     def test_no_eoc_special_token_remove_original_sample(self):
 
@@ -236,7 +236,7 @@ def test_multi_process_remove_original_sample(self):
                                    caption_num=caption_num,
                                    keep_candidate_mode='random_any',
                                    keep_original_sample=False)
-        self._run_mapper(dataset, op, num_proc=4, caption_num=len(dataset))
+        self._run_mapper(dataset, op, num_proc=2, caption_num=len(dataset))
 
 
 if __name__ == '__main__':
diff --git a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
index d9bf29724..dec5e4e9b 100644
--- a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class VideoCaptioningFromFramesMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
@@ -149,7 +149,7 @@ def test_multi_process(self):
             'videos': [self.vid1_path]
         }] * 10
         op = VideoCaptioningFromFramesMapper(hf_img2seq=self.hf_img2seq)
-        self._run_mapper(ds_list, op, num_proc=4, caption_num=len(ds_list) * 2)
+        self._run_mapper(ds_list, op, num_proc=2, caption_num=len(ds_list) * 2)
 
     def test_multi_process_remove_original_sample(self):
         ds_list = [{
@@ -159,7 +159,7 @@ def test_multi_process_remove_original_sample(self):
 
         op = VideoCaptioningFromFramesMapper(hf_img2seq=self.hf_img2seq,
                                              keep_original_sample=False)
-        self._run_mapper(ds_list, op, num_proc=4, caption_num=len(ds_list))
+        self._run_mapper(ds_list, op, num_proc=2, caption_num=len(ds_list))
 
     def test_frame_sampling_method(self):
 
diff --git a/tests/ops/mapper/test_video_captioning_from_video_mapper.py b/tests/ops/mapper/test_video_captioning_from_video_mapper.py
index f3de27226..1a4a49fcf 100644
--- a/tests/ops/mapper/test_video_captioning_from_video_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_video_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to OOM on the current runner
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class VideoCaptioningFromVideoMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
@@ -149,7 +149,7 @@ def test_multi_process(self):
             'videos': [self.vid1_path]
         }] * 10
         op = VideoCaptioningFromVideoMapper(hf_video_blip=self.hf_video_blip)
-        self._run_mapper(ds_list, op, num_proc=4, caption_num=len(ds_list) * 2)
+        self._run_mapper(ds_list, op, num_proc=2, caption_num=len(ds_list) * 2)
 
     def test_multi_process_remove_original_sample(self):
         ds_list = [{
@@ -159,7 +159,7 @@ def test_multi_process_remove_original_sample(self):
 
         op = VideoCaptioningFromVideoMapper(hf_video_blip=self.hf_video_blip,
                                             keep_original_sample=False)
-        self._run_mapper(ds_list, op, num_proc=4, caption_num=len(ds_list))
+        self._run_mapper(ds_list, op, num_proc=2, caption_num=len(ds_list))
 
     def test_frame_sampling_method(self):
 

From 8a4e822e4cad924bc6ac591161a5107590a4fe72 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Wed, 25 Dec 2024 15:54:24 +0800
Subject: [PATCH 06/26] - use 2 np instead of 4 np for unittest to resolve OOM
 problem

---
 tests/ops/mapper/test_image_captioning_mapper.py            | 6 +-----
 .../ops/mapper/test_video_captioning_from_frames_mapper.py  | 6 +-----
 tests/ops/mapper/test_video_captioning_from_video_mapper.py | 6 +-----
 3 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/tests/ops/mapper/test_image_captioning_mapper.py b/tests/ops/mapper/test_image_captioning_mapper.py
index 0c6c06f99..9e56c32c4 100644
--- a/tests/ops/mapper/test_image_captioning_mapper.py
+++ b/tests/ops/mapper/test_image_captioning_mapper.py
@@ -5,13 +5,9 @@
 from data_juicer.ops.mapper.image_captioning_mapper import \
     ImageCaptioningMapper
 from data_juicer.utils.mm_utils import SpecialTokens
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# Skip tests for this OP in the GitHub actions due to OOM on the current runner
-# These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
 class ImageCaptioningMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
index dec5e4e9b..9504c3662 100644
--- a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py
@@ -5,13 +5,9 @@
 from data_juicer.ops.mapper.video_captioning_from_frames_mapper import \
     VideoCaptioningFromFramesMapper
 from data_juicer.utils.mm_utils import SpecialTokens
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# Skip tests for this OP in the GitHub actions due to OOM on the current runner
-# These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
 class VideoCaptioningFromFramesMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_captioning_from_video_mapper.py b/tests/ops/mapper/test_video_captioning_from_video_mapper.py
index 1a4a49fcf..fa3714e0a 100644
--- a/tests/ops/mapper/test_video_captioning_from_video_mapper.py
+++ b/tests/ops/mapper/test_video_captioning_from_video_mapper.py
@@ -5,13 +5,9 @@
 from data_juicer.ops.mapper.video_captioning_from_video_mapper import \
     VideoCaptioningFromVideoMapper
 from data_juicer.utils.mm_utils import SpecialTokens
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# Skip tests for this OP in the GitHub actions due to OOM on the current runner
-# These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
 class VideoCaptioningFromVideoMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',

From 13460e543ea210b50965d652319a2ebd08d25a85 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Thu, 26 Dec 2024 11:28:09 +0800
Subject: [PATCH 07/26] * update doc of analyzer

---
 data_juicer/core/analyzer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_juicer/core/analyzer.py b/data_juicer/core/analyzer.py
index d9ac586e9..4f76e71bd 100644
--- a/data_juicer/core/analyzer.py
+++ b/data_juicer/core/analyzer.py
@@ -114,7 +114,7 @@ def run(self,
                         f'[{self.cfg.fusion_strategy}]...')
             ops = fuse_operators(ops, probe_res)
 
-        # 2. stats precompute only for filter ops
+        # 2. stats precompute only for filter or tagging ops
         logger.info('Computing the stats of dataset...')
         stats_collected = False
         for op in ops:
@@ -147,7 +147,7 @@ def run(self,
             compress(dataset)
 
         # 4. analysis and output result to the export path
-        # 4.1. Only consider fields in Fields.stats
+        # 4.1. Only consider fields in Fields.stats and Fields.meta
         # 4.2. For string fields, only consider its histogram
         # 4.3. For numeric fields, consider its histogram and box
         # 4.4. Otherwise, DO NOT analyze

From 4110a1a35f7733615e82f107ebed31a5ed920572 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Fri, 27 Dec 2024 12:02:08 +0800
Subject: [PATCH 08/26] * fix undefined device_map: use 'balanced' by default,
 or use the `to` method to move models to the specified device * fix
 unrecognized dtype: pass a torch.dtype instead of a string like 'fp16' * open
 unittest for image_diffusion_mapper

---
 data_juicer/utils/model_utils.py                | 16 ++++++++++++++--
 tests/ops/mapper/test_image_diffusion_mapper.py |  7 +------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py
index 6e32434fa..7c9ab144a 100644
--- a/data_juicer/utils/model_utils.py
+++ b/data_juicer/utils/model_utils.py
@@ -59,6 +59,12 @@
     'ram_plus_swin_large_14m.pth',
 }
 
+TORCH_DTYPE_MAPPING = {
+    'fp32': torch.float32,
+    'fp16': torch.float16,
+    'bf16': torch.bfloat16,
+}
+
 
 def get_backup_model_link(model_name):
     for pattern, url in BACKUP_MODEL_LINKS.items():
@@ -282,8 +288,12 @@ def prepare_diffusion_model(pretrained_model_name_or_path, diffusion_type,
     """
     AUTOINSTALL.check(['torch', 'transformers'])
 
-    if 'device' in model_params:
-        model_params['device_map'] = model_params.pop('device')
+    device = model_params.pop('device', None)
+    if not device:
+        model_params['device_map'] = 'balanced'
+    if 'torch_dtype' in model_params:
+        model_params['torch_dtype'] = TORCH_DTYPE_MAPPING[
+            model_params['torch_dtype']]
 
     diffusion_type_to_pipeline = {
         'image2image': diffusers.AutoPipelineForImage2Image,
@@ -300,6 +310,8 @@ def prepare_diffusion_model(pretrained_model_name_or_path, diffusion_type,
     pipeline = diffusion_type_to_pipeline[diffusion_type]
     model = pipeline.from_pretrained(pretrained_model_name_or_path,
                                      **model_params)
+    if device:
+        model = model.to(device)
 
     return model
 
diff --git a/tests/ops/mapper/test_image_diffusion_mapper.py b/tests/ops/mapper/test_image_diffusion_mapper.py
index 5883a7ff7..55d42593d 100644
--- a/tests/ops/mapper/test_image_diffusion_mapper.py
+++ b/tests/ops/mapper/test_image_diffusion_mapper.py
@@ -6,13 +6,8 @@
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.image_diffusion_mapper import ImageDiffusionMapper
 from data_juicer.utils.mm_utils import SpecialTokens
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
-
-# Skip tests for this OP in the GitHub actions due to OOM on the current runner
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class ImageDiffusionMapperTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',

From 829a0c70287fad4ede62a1ae73e83548445eb45d Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Fri, 27 Dec 2024 15:32:57 +0800
Subject: [PATCH 09/26] * fix bugs in video_captioning_from_summarizer_mapper
 due to reorganization of meta and tags

---
 .../mapper/video_captioning_from_summarizer_mapper.py  | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py
index 67eb7e234..ca095af33 100644
--- a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py
+++ b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py
@@ -197,17 +197,19 @@ def _process_single_sample(self, sample, rank=None):
             temp_sample = {
                 self.text_key: chunk,
                 self.video_key: loaded_video_keys[offset:offset + vid_count],
+                Fields.meta: {}
             }
 
             captioned_text_list = []
             # tag ops
             for op in self.tag_op_list:
                 temp_sample = op.process(temp_sample, rank=rank)
-            if Fields.video_audio_tags in temp_sample:
+            if Fields.video_audio_tags in temp_sample[Fields.meta]:
                 captioned_text_list.extend(
-                    temp_sample[Fields.video_audio_tags])
-            if Fields.video_frame_tags in temp_sample:
-                for tag_list in temp_sample[Fields.video_frame_tags]:
+                    temp_sample[Fields.meta][Fields.video_audio_tags])
+            if Fields.video_frame_tags in temp_sample[Fields.meta]:
+                for tag_list in temp_sample[Fields.meta][
+                        Fields.video_frame_tags]:
                     captioned_text_list.extend(tag_list[self.keep_tag_num])
             # cap ops
             for op in self.cap_op_list:

From 3aee5313e7571803f21b5e9d6c607aa0b23be81c Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Fri, 27 Dec 2024 16:28:55 +0800
Subject: [PATCH 10/26] - remove unused imports

---
 data_juicer/ops/aggregator/entity_attribute_aggregator.py     | 4 ----
 .../ops/aggregator/most_relavant_entities_aggregator.py       | 4 ----
 data_juicer/ops/aggregator/nested_aggregator.py               | 4 ----
 3 files changed, 12 deletions(-)

diff --git a/data_juicer/ops/aggregator/entity_attribute_aggregator.py b/data_juicer/ops/aggregator/entity_attribute_aggregator.py
index 96fbbb63f..16ec5fd07 100644
--- a/data_juicer/ops/aggregator/entity_attribute_aggregator.py
+++ b/data_juicer/ops/aggregator/entity_attribute_aggregator.py
@@ -8,14 +8,10 @@
 from data_juicer.utils.common_utils import (avg_split_string_list_under_limit,
                                             is_string_list, nested_access,
                                             nested_set)
-from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.model_utils import get_model, prepare_model
 
 from .nested_aggregator import NestedAggregator
 
-torch = LazyLoader('torch', 'torch')
-vllm = LazyLoader('vllm', 'vllm')
-
 OP_NAME = 'entity_attribute_aggregator'
 
 
diff --git a/data_juicer/ops/aggregator/most_relavant_entities_aggregator.py b/data_juicer/ops/aggregator/most_relavant_entities_aggregator.py
index 69e1a209c..7ca49f505 100644
--- a/data_juicer/ops/aggregator/most_relavant_entities_aggregator.py
+++ b/data_juicer/ops/aggregator/most_relavant_entities_aggregator.py
@@ -7,14 +7,10 @@
 from data_juicer.ops.base_op import OPERATORS, Aggregator
 from data_juicer.utils.common_utils import (is_string_list, nested_access,
                                             nested_set)
-from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.model_utils import get_model, prepare_model
 
 from ..common import split_text_by_punctuation
 
-torch = LazyLoader('torch', 'torch')
-vllm = LazyLoader('vllm', 'vllm')
-
 OP_NAME = 'most_relavant_entities_aggregator'
 
 
diff --git a/data_juicer/ops/aggregator/nested_aggregator.py b/data_juicer/ops/aggregator/nested_aggregator.py
index 124eb1470..ab25e057d 100644
--- a/data_juicer/ops/aggregator/nested_aggregator.py
+++ b/data_juicer/ops/aggregator/nested_aggregator.py
@@ -6,12 +6,8 @@
 from data_juicer.ops.base_op import OPERATORS, Aggregator
 from data_juicer.utils.common_utils import (avg_split_string_list_under_limit,
                                             is_string_list, nested_access)
-from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-torch = LazyLoader('torch', 'torch')
-vllm = LazyLoader('vllm', 'vllm')
-
 OP_NAME = 'nested_aggregator'
 
 

From 480592856f20780315e4ee741951460f69deac94 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 30 Dec 2024 10:59:51 +0800
Subject: [PATCH 11/26] * open unittest for nlpcda_zh_mapper

---
 tests/ops/mapper/test_nlpcda_zh_mapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ops/mapper/test_nlpcda_zh_mapper.py b/tests/ops/mapper/test_nlpcda_zh_mapper.py
index ce21ea55d..df4088c15 100644
--- a/tests/ops/mapper/test_nlpcda_zh_mapper.py
+++ b/tests/ops/mapper/test_nlpcda_zh_mapper.py
@@ -9,7 +9,7 @@
 
 # Skip tests for this OP in the GitHub actions due to unknown UnicodeEncodeError
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class NlpaugEnMapperTest(DataJuicerTestCaseBase):
 
     def setUp(self):

From 6020a41cb501db199a57493d1a3f33f3cca34d3a Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 30 Dec 2024 12:02:32 +0800
Subject: [PATCH 12/26] * set the default encoding of stdout to utf-8

---
 data_juicer/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/data_juicer/__init__.py b/data_juicer/__init__.py
index 7b7173c37..0bfdfdc48 100644
--- a/data_juicer/__init__.py
+++ b/data_juicer/__init__.py
@@ -10,6 +10,9 @@
 
 from data_juicer.utils.availability_utils import _is_package_available
 
+# set the default encoding to utf-8 explicitly
+sys.stdout.reconfigure(encoding='utf-8')
+
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 # For now, only INFO will be shown. Later the severity level will be changed

From d464b7950b14a9115a99c384665ebdd334f8bbb4 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 30 Dec 2024 14:20:31 +0800
Subject: [PATCH 13/26] * move stdout utf-8 reconfigure into nlpcda_zh_mapper

---
 data_juicer/__init__.py                    | 3 ---
 data_juicer/ops/mapper/nlpcda_zh_mapper.py | 4 ++++
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/data_juicer/__init__.py b/data_juicer/__init__.py
index 0bfdfdc48..7b7173c37 100644
--- a/data_juicer/__init__.py
+++ b/data_juicer/__init__.py
@@ -10,9 +10,6 @@
 
 from data_juicer.utils.availability_utils import _is_package_available
 
-# set the default encoding to utf-8 explicitly
-sys.stdout.reconfigure(encoding='utf-8')
-
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 # For now, only INFO will be shown. Later the severity level will be changed
diff --git a/data_juicer/ops/mapper/nlpcda_zh_mapper.py b/data_juicer/ops/mapper/nlpcda_zh_mapper.py
index 80c874dbf..d9538047e 100644
--- a/data_juicer/ops/mapper/nlpcda_zh_mapper.py
+++ b/data_juicer/ops/mapper/nlpcda_zh_mapper.py
@@ -1,3 +1,4 @@
+import sys
 from copy import deepcopy
 
 from loguru import logger
@@ -8,6 +9,9 @@
 
 from ..base_op import OPERATORS, Mapper
 
+# set the default encoding to utf-8 explicitly
+sys.stdout.reconfigure(encoding='utf-8')
+
 nlpcda = LazyLoader('nlpcda', 'nlpcda')
 
 OP_NAME = 'nlpcda_zh_mapper'

From 8981b63d1650645f95ae4b74a058a899406de0b5 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 30 Dec 2024 15:13:16 +0800
Subject: [PATCH 14/26] * move stdout utf-8 reconfigure into test setUp

---
 data_juicer/ops/mapper/nlpcda_zh_mapper.py | 4 ----
 tests/ops/mapper/test_nlpcda_zh_mapper.py  | 3 +++
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/data_juicer/ops/mapper/nlpcda_zh_mapper.py b/data_juicer/ops/mapper/nlpcda_zh_mapper.py
index d9538047e..80c874dbf 100644
--- a/data_juicer/ops/mapper/nlpcda_zh_mapper.py
+++ b/data_juicer/ops/mapper/nlpcda_zh_mapper.py
@@ -1,4 +1,3 @@
-import sys
 from copy import deepcopy
 
 from loguru import logger
@@ -9,9 +8,6 @@
 
 from ..base_op import OPERATORS, Mapper
 
-# set the default encoding to utf-8 explicitly
-sys.stdout.reconfigure(encoding='utf-8')
-
 nlpcda = LazyLoader('nlpcda', 'nlpcda')
 
 OP_NAME = 'nlpcda_zh_mapper'
diff --git a/tests/ops/mapper/test_nlpcda_zh_mapper.py b/tests/ops/mapper/test_nlpcda_zh_mapper.py
index df4088c15..803148e36 100644
--- a/tests/ops/mapper/test_nlpcda_zh_mapper.py
+++ b/tests/ops/mapper/test_nlpcda_zh_mapper.py
@@ -13,6 +13,9 @@
 class NlpaugEnMapperTest(DataJuicerTestCaseBase):
 
     def setUp(self):
+        import sys
+        # set the default encoding to utf-8 explicitly
+        sys.stdout.reconfigure(encoding='utf-8')
         self.samples = Dataset.from_dict({
             'text': ['这里一共有5种不同的数据增强方法', '这是不带数字的测试样例'],
             'meta': ['meta information', 'meta information without numbers'],

From 2025a2e80236105b9e14a1ba65e1a57ac7634dfe Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 30 Dec 2024 16:08:17 +0800
Subject: [PATCH 15/26] * test for raft

---
 tests/ops/filter/test_video_motion_score_raft_filter.py | 3 ++-
 tests/ops/mapper/test_nlpcda_zh_mapper.py               | 5 +----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/ops/filter/test_video_motion_score_raft_filter.py b/tests/ops/filter/test_video_motion_score_raft_filter.py
index 89f9e0548..1402e203a 100644
--- a/tests/ops/filter/test_video_motion_score_raft_filter.py
+++ b/tests/ops/filter/test_video_motion_score_raft_filter.py
@@ -10,7 +10,7 @@
 
 # skip due to conflicts when run lazy_load in multiprocessing in librosa
 # tests passed locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class VideoMotionScoreRaftFilterTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
@@ -26,6 +26,7 @@ def _run_helper(self, op, source_list, target_list, np=1):
                                          column=[{}] * dataset.num_rows)
         dataset = dataset.map(op.compute_stats, num_proc=np)
         dataset = dataset.filter(op.process, num_proc=np)
+        print(dataset[Fields.stats])
         dataset = dataset.select_columns(column_names=[op.video_key])
         res_list = dataset.to_list()
         self.assertEqual(res_list, target_list)
diff --git a/tests/ops/mapper/test_nlpcda_zh_mapper.py b/tests/ops/mapper/test_nlpcda_zh_mapper.py
index 803148e36..ce21ea55d 100644
--- a/tests/ops/mapper/test_nlpcda_zh_mapper.py
+++ b/tests/ops/mapper/test_nlpcda_zh_mapper.py
@@ -9,13 +9,10 @@
 
 # Skip tests for this OP in the GitHub actions due to unknown UnicodeEncodeError
 # These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
+@SKIPPED_TESTS.register_module()
 class NlpaugEnMapperTest(DataJuicerTestCaseBase):
 
     def setUp(self):
-        import sys
-        # set the default encoding to utf-8 explicitly
-        sys.stdout.reconfigure(encoding='utf-8')
         self.samples = Dataset.from_dict({
             'text': ['这里一共有5种不同的数据增强方法', '这是不带数字的测试样例'],
             'meta': ['meta information', 'meta information without numbers'],

From 85e757dad5f0208681bc83f226bff0d4bae44aee Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 30 Dec 2024 17:36:51 +0800
Subject: [PATCH 16/26] * change the thresholds

---
 .../filter/test_video_motion_score_raft_filter.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/tests/ops/filter/test_video_motion_score_raft_filter.py b/tests/ops/filter/test_video_motion_score_raft_filter.py
index 1402e203a..c41efa3b4 100644
--- a/tests/ops/filter/test_video_motion_score_raft_filter.py
+++ b/tests/ops/filter/test_video_motion_score_raft_filter.py
@@ -6,11 +6,7 @@
 from data_juicer.ops.filter.video_motion_score_raft_filter import \
     VideoMotionScoreRaftFilter
 from data_juicer.utils.constant import Fields
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
-
-# skip due to conflicts when run lazy_load in multiprocessing in librosa
-# tests passed locally.
-# @SKIPPED_TESTS.register_module()
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 class VideoMotionScoreRaftFilterTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
@@ -26,7 +22,6 @@ def _run_helper(self, op, source_list, target_list, np=1):
                                          column=[{}] * dataset.num_rows)
         dataset = dataset.map(op.compute_stats, num_proc=np)
         dataset = dataset.filter(op.process, num_proc=np)
-        print(dataset[Fields.stats])
         dataset = dataset.select_columns(column_names=[op.video_key])
         res_list = dataset.to_list()
         self.assertEqual(res_list, target_list)
@@ -134,7 +129,7 @@ def test_middle(self):
             'videos': [self.vid3_path]
         }]
         tgt_list = [{'videos': [self.vid2_path]}]
-        op = VideoMotionScoreRaftFilter(min_score=3, max_score=10.5)
+        op = VideoMotionScoreRaftFilter(min_score=3, max_score=10.2)
         self._run_helper(op, ds_list, tgt_list)
 
     def test_any(self):
@@ -151,7 +146,7 @@ def test_any(self):
             'videos': [self.vid2_path, self.vid3_path]
         }]
         op = VideoMotionScoreRaftFilter(min_score=3,
-                                    max_score=10.5,
+                                    max_score=10.2,
                                     any_or_all='any')
         self._run_helper(op, ds_list, tgt_list)
 
@@ -165,7 +160,7 @@ def test_all(self):
         }]
         tgt_list = []
         op = VideoMotionScoreRaftFilter(min_score=3,
-                                    max_score=10.5,
+                                    max_score=10.2,
                                     any_or_all='all')
         self._run_helper(op, ds_list, tgt_list)
 
@@ -181,7 +176,7 @@ def test_parallel(self):
             'videos': [self.vid3_path]
         }]
         tgt_list = [{'videos': [self.vid2_path]}]
-        op = VideoMotionScoreRaftFilter(min_score=3, max_score=10.5)
+        op = VideoMotionScoreRaftFilter(min_score=3, max_score=10.2)
         self._run_helper(op, ds_list, tgt_list, np=2)
 
 

From b41833ef1d1fb54a7060159aa6e31eaa78a99c29 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Thu, 9 Jan 2025 20:17:51 +0800
Subject: [PATCH 17/26] + add mem_required for generate_qa_from_text_mapper *
 fix typos

---
 configs/config_all.yaml | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 82cd6824e..785abce43 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -185,6 +185,7 @@ process:
       enable_vllm: false                                      # Whether to use vllm for inference acceleration.
       model_params: {}                                        # Parameters for initializing the model.
       sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
+      mem_required: '31GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - image_blur_mapper:                                      # mapper to blur images.
       p: 0.2                                                  # probability of the image being blured
       blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
@@ -206,7 +207,7 @@ process:
       keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
       prompt: null                                            # a string prompt to guide the generation of blip2 model for all samples globally. It's None in default, which means no prompt provided.
       prompt_key: null                                        # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
-      mem_required: '16GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '16GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - image_diffusion_mapper:                                 # generate images by diffusion model
       hf_diffusion: 'CompVis/stable-diffusion-v1-4'           # stable diffusion model name on huggingface to generate image
       torch_dtype: 'fp32'                                     # the floating point type used to load the diffusion model. Can be one of ['fp32', 'fp16', 'bf16']
@@ -217,7 +218,7 @@ process:
       keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only generated images in the final datasets and the original images will be removed. It's True in default.
       caption_key: null                                       # the key name of fields in samples to store captions for each images, the caption guide the diffusion model to produce what the image is
       hf_img2seq: 'Salesforce/blip2-opt-2.7b'                 # model name on huggingface to generate caption if caption_key is null
-      mem_required: '8GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '8GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - image_face_blur_mapper:                                 # blur faces detected in images
       cv_classifier: ''                                       # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
       blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
@@ -329,7 +330,7 @@ process:
       trust_remote_code: True                                 # for loading huggingface model.
   - video_captioning_from_audio_mapper:                     # caption a video according to its audio streams based on Qwen-Audio model
       keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only captioned sample in the final datasets and the original sample will be removed. It's True in default.
-      mem_required: '30GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '30GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_captioning_from_frames_mapper:                    # generate samples whose captions are generated based on an image-to-text model and sampled video frames. Captions from different frames will be concatenated to a single string.
       hf_img2seq: 'Salesforce/blip2-opt-2.7b'                 # image-to-text model name on huggingface to generate caption
       caption_num: 1                                          # how many candidate captions to generate for each video
@@ -341,7 +342,7 @@ process:
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       horizontal_flip: false                                  # flip frame image horizontally (left to right).
       vertical_flip: false                                    # flip frame image vertically (top to bottom).
-      mem_required: '20GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '20GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_captioning_from_summarizer_mapper:                # generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...)
       hf_summarizer: 'mrm8488/flan-t5-large-finetuned-openai-summarize_from_feedback'  # the summarizer model used to summarize texts generated by other methods.
       consider_video_caption_from_video: true                 # whether to consider the video caption generated from video directly in the summarization process. Default: True.
@@ -355,7 +356,7 @@ process:
       vid_tag_from_frm_args: null                             # the arg dict for video tagging from sampled frames from the video with keys are the arg names and values are the arg values. Default: None.
       keep_tag_num: 5                                         # max number N of tags from sampled frames to keep. Too many tags might bring negative influence to summarized text, so we consider to only keep the N most frequent tags. Default: 5.
       keep_original_sample: true                              # whether to keep the original sample. If it's set to False, there will be only summarized captions in the final datasets and the original captions will be removed. It's True in default.
-      mem_required: '40GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '40GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_captioning_from_video_mapper:                     # generate captions by frame images extracted from video to augment datasets
       hf_video_blip: 'kpyu/video-blip-opt-2.7b-ego4d'         # video-blip model name on huggingface to generate caption
       caption_num: 1                                          # how many candidate captions to generate for each video
@@ -367,7 +368,7 @@ process:
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       horizontal_flip: false                                  # flip frame image horizontally (left to right).
       vertical_flip: false                                    # flip frame image vertically (top to bottom).
-      mem_required: '20GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '20GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_extract_frames_mapper:                            # extract frames from video files according to specified methods
       frame_sampling_method: 'all_keyframes'                  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
@@ -410,7 +411,7 @@ process:
   - video_tagging_from_audio_mapper:                        # Mapper to generate video tags from audio streams extracted from the video.
       hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593'       # Huggingface model name for the audio classification model.
       tag_field_name: '__dj__video_audio_tags__'              # the field name to store the tags. It's "__dj__video_audio_tags__" in default.
-      mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_tagging_from_frames_mapper:                       # Mapper to generate video tags from frames extracted from the video.
       frame_sampling_method: 'all_keyframes'                  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
@@ -456,7 +457,7 @@ process:
       min_score: 0.3                                          # the min aesthetics score of filter range
       max_score: 1.0                                          # the max aesthetics score of filter range
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
-      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - image_aspect_ratio_filter:                              # filter samples according to the aspect ratios of images (a fraction of width by height, r=w/h) in them
       min_ratio: 0.333                                        # the min aspect ratio of filter range
       max_ratio: 3.0                                          # the max aspect ratio of filter range
@@ -473,7 +474,7 @@ process:
       hf_nsfw_model: Falconsai/nsfw_image_detection           # Huggingface model name for nsfw classification
       score_threshold: 0.5                                    # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
-      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - image_pair_similarity_filter:                           # filter samples according to the similarity score between the image pair.
       hf_clip: 'openai/clip-vit-base-patch32'                 # model name of the CLIP model on huggingface
       min_score: 0.1                                          # the min similarity score of filter range
@@ -497,7 +498,7 @@ process:
       vertical_flip: false                                    # flip image vertically (top to bottom).
       reduce_mode: avg                                        # reduce mode when one text corresponds to multiple images in a chunk,  must be one of ['avg','max', 'min'].
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
-      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - image_text_similarity_filter:                           # filter samples according to the similarity between image and text.
       hf_clip: openai/clip-vit-base-patch32                   # name of used Hugging Face clip
       min_score: 0.1                                          # the min similarity of filter range
@@ -506,12 +507,12 @@ process:
       vertical_flip: false                                    # flip image vertically (top to bottom).
       reduce_mode: avg                                        # reduce mode when one text corresponds to multiple images in a chunk,  must be one of ['avg','max', 'min'].
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
-      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - image_watermark_filter:                                 # filter samples according to the predicted watermark probabilities of images in them
       hf_watermark_model: amrul-hzz/watermark_detector        # Huggingface model name for watermark classification
       prob_threshold: 0.8                                     # the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept.
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
-      mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - language_id_score_filter:                               # filter text in specific language with language scores larger than a specific max value
       lang: en                                                # keep text in what language
       min_score: 0.8                                          # the min language scores to filter text
@@ -532,7 +533,7 @@ process:
       conf_thr: 0.0                                           # the confidence score threshold for removing low-confidence bboxes
       reduce_mode: avg                                        # reduce mode when one text corresponds to multiple images in a chunk,  must be one of ['avg','max', 'min'].
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
-      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - special_characters_filter:                              # filter text with special-char ratio out of specific range
       min_ratio: 0.0                                          # the min ratio of filter range
       max_ratio: 0.25                                         # the max ratio of filter range
@@ -575,7 +576,7 @@ process:
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       reduce_mode: avg                                        # reduce mode to the all frames extracted from videos, must be one of ['avg','max', 'min'].
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
-      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_aspect_ratio_filter:                              # filter samples according to the aspect ratios of videos (a fraction of width by height, r=w/h) in them
       min_ratio: 9/21                                         # the minimum aspect ratio to keep samples, supported format is a string, such as "9:21" or "9/21".
       max_ratio: 21/9                                         # the maximum aspect ratio to keep samples, supported format is a string, such as "21:9" or "21/9".
@@ -594,7 +595,7 @@ process:
       vertical_flip: false                                    # flip frame image vertically (top to bottom).
       reduce_mode: avg                                        # reduce mode when one text corresponds to multiple videos in a chunk,  must be one of ['avg','max', 'min'].
       any_or_all: any                                         # keep this sample when any/all videos meet the filter condition
-      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '1500MB'                                  # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_motion_score_filter:                              # Keep samples with video motion scores within a specific range.
       min_score: 0.25                                         # the minimum motion score to keep samples
       max_score: 10000.0                                      # the maximum motion score to keep samples
@@ -620,7 +621,7 @@ process:
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       reduce_mode: avg                                        # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min'].
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
-      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '1GB'                                     # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_ocr_area_ratio_filter:                            # Keep data samples whose detected text area ratios for specified frames in the video are within a specified range.
       min_area_ratio: 0                                       # the min ocr area ratio to keep samples. It's 0 by default.
       max_area_ratio: 1.0                                     # the max ocr area ratio to keep samples. It's 1.0 by default.
@@ -640,7 +641,7 @@ process:
       frame_num: 3                                            # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       reduce_mode: avg                                        # reduce mode for multiple sampled video frames to compute final predicted watermark probabilities of videos, must be one of ['avg','max', 'min'].
       any_or_all: any                                         # keep this sample when any/all images meet the filter condition
-      mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+      mem_required: '500MB'                                   # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - video_tagging_from_frames_filter:                       # filter samples according to the tags of videos in them
       tags: ['people']                                        # a tag list to shift the videos, total tags can be found in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt
       contain: any                                            # require the videos containing 'any' or 'all' given tags. When tags equal to [], 'all' keeps all samples, 'any' keeps no sample.

From fb503dbd79e899e49833a0c456473d7ea1b3e120 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 13 Jan 2025 17:02:58 +0800
Subject: [PATCH 18/26] + add mem_required for two ops

---
 configs/config_all.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 785abce43..03a370946 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -179,6 +179,7 @@ process:
       enable_vllm: false                                      # Whether to use vllm for inference acceleration.
       model_params: {}                                        # Parameters for initializing the model.
       sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
+      mem_required: '31GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - generate_qa_from_text_mapper:                           # mapper to generate question and answer pairs from text.
       hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'           # Model name on huggingface to generate question and answer pairs.
       output_pattern: null                                    # Regular expression pattern to extract questions and answers from model response.
@@ -257,6 +258,7 @@ process:
       enable_vllm: false                                      # whether to use vllm for inference acceleration.
       model_params: {}                                        # Parameters for initializing the model.
       sampling_params: {}                                     # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
+      mem_required: '31GB'                                    # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
   - optimize_query_mapper:                                  # optimize query in question-answer pairs.
   - optimize_response_mapper:                               # optimize response in question-answer pairs.
   - pair_preference_mapper:                                 # construct paired preference samples.

From 432de0fef6253d47441a00f35954bb89f798f420 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Wed, 15 Jan 2025 14:48:26 +0800
Subject: [PATCH 19/26] - open unittest for generate_qa_from_text_mapper

---
 tests/ops/mapper/test_generate_qa_from_text_mapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ops/mapper/test_generate_qa_from_text_mapper.py b/tests/ops/mapper/test_generate_qa_from_text_mapper.py
index 7b3131fd3..6019d8949 100644
--- a/tests/ops/mapper/test_generate_qa_from_text_mapper.py
+++ b/tests/ops/mapper/test_generate_qa_from_text_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to ?
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class GenerateQAFromTextMapperTest(DataJuicerTestCaseBase):
     text_key = 'text'
 

From 06267ba00ff4ff6e3a6d03ffe3befbaf789d8328 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Wed, 15 Jan 2025 17:42:59 +0800
Subject: [PATCH 20/26] - open unittest for generate_qa_from_examples_mapper

---
 tests/ops/mapper/test_generate_qa_from_examples_mapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py
index 023394f9b..e1e18bd4f 100644
--- a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py
+++ b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py
@@ -10,7 +10,7 @@
 
 # Skip tests for this OP in the GitHub actions due to ?
 # These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
+# @SKIPPED_TESTS.register_module()
 class GenerateQAFromExamplesMapperTest(DataJuicerTestCaseBase):
     text_key = 'text'
 

From bece0d12a0dca95013043763f042525739ea4238 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 20 Jan 2025 16:24:20 +0800
Subject: [PATCH 21/26] * fix skip_op_error & update_sampling_params

---
 data_juicer/ops/base_op.py                                 | 2 +-
 data_juicer/ops/mapper/generate_qa_from_examples_mapper.py | 7 +++----
 data_juicer/ops/mapper/generate_qa_from_text_mapper.py     | 7 +++----
 data_juicer/ops/mapper/optimize_qa_mapper.py               | 7 +++----
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
index 58efbbb5e..e86b844c5 100644
--- a/data_juicer/ops/base_op.py
+++ b/data_juicer/ops/base_op.py
@@ -108,7 +108,7 @@ def wrapper(sample, *args, **kwargs):
                 else:
                     return [res]
             except Exception as e:
-                if skip_op_error:
+                if not skip_op_error:
                     raise
                 from loguru import logger
                 logger.error(f'An error occurred in {op_name} when processing '
diff --git a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py
index a7a05ae19..b8c131dcb 100644
--- a/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py
+++ b/data_juicer/ops/mapper/generate_qa_from_examples_mapper.py
@@ -118,6 +118,9 @@ def __init__(self,
         model_params = model_params or {}
         sampling_params = sampling_params or {}
 
+        sampling_params = update_sampling_params(sampling_params, hf_model,
+                                                 self.enable_vllm)
+
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
             # cannot initialize vllm replicas on different GPUs
@@ -140,10 +143,6 @@ def __init__(self,
                 **model_params)
             self.sampling_params = sampling_params
 
-        self.sampling_params = update_sampling_params(sampling_params,
-                                                      hf_model,
-                                                      self.enable_vllm)
-
         self.seed_qa_samples = self._load_seed_qa_samples()
         if len(self.seed_qa_samples) == 0:
             raise ValueError('No QA data was parsed from the seed file!')
diff --git a/data_juicer/ops/mapper/generate_qa_from_text_mapper.py b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py
index 832f1c782..004e16782 100644
--- a/data_juicer/ops/mapper/generate_qa_from_text_mapper.py
+++ b/data_juicer/ops/mapper/generate_qa_from_text_mapper.py
@@ -85,6 +85,9 @@ def __init__(self,
         model_params = model_params or {}
         sampling_params = sampling_params or {}
 
+        sampling_params = update_sampling_params(sampling_params, hf_model,
+                                                 self.enable_vllm)
+
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
             # cannot initialize vllm replicas on different GPUs
@@ -107,10 +110,6 @@ def __init__(self,
                 **model_params)
             self.sampling_params = sampling_params
 
-        self.sampling_params = update_sampling_params(sampling_params,
-                                                      hf_model,
-                                                      self.enable_vllm)
-
     def parse_output(self, raw_output):
         logger.debug(raw_output)
         qa_list = []
diff --git a/data_juicer/ops/mapper/optimize_qa_mapper.py b/data_juicer/ops/mapper/optimize_qa_mapper.py
index 8402f1683..04f2a6946 100644
--- a/data_juicer/ops/mapper/optimize_qa_mapper.py
+++ b/data_juicer/ops/mapper/optimize_qa_mapper.py
@@ -77,6 +77,9 @@ def __init__(self,
         model_params = model_params or {}
         sampling_params = sampling_params or {}
 
+        sampling_params = update_sampling_params(sampling_params, hf_model,
+                                                 self.enable_vllm)
+
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
             # cannot initialize vllm replicas on different GPUs
@@ -99,10 +102,6 @@ def __init__(self,
                 **model_params)
             self.sampling_params = sampling_params
 
-        self.sampling_params = update_sampling_params(sampling_params,
-                                                      hf_model,
-                                                      self.enable_vllm)
-
     def build_input(self, sample):
         qa_pair = self.qa_pair_template.format(sample[self.query_key],
                                                sample[self.response_key])

From 88e9aa446887bd0105c5372cbaf24d84fdeff8c7 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 20 Jan 2025 19:12:14 +0800
Subject: [PATCH 22/26] * update vllm version requirement for generation_config
 param

---
 environments/science_requires.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environments/science_requires.txt b/environments/science_requires.txt
index c90d8e72f..52e87a610 100644
--- a/environments/science_requires.txt
+++ b/environments/science_requires.txt
@@ -24,7 +24,7 @@ simple-aesthetics-predictor
 scenedetect[opencv]
 ffmpeg-python
 opencv-python
-vllm>=0.1.3
+vllm>=0.6.5
 rouge
 dashscope
 openai

From 26d0f841d5b57501471de8ed287518442da3a7e3 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Mon, 20 Jan 2025 19:13:11 +0800
Subject: [PATCH 23/26] * skip vllm ops

---
 tests/ops/mapper/test_generate_qa_from_examples_mapper.py | 2 +-
 tests/ops/mapper/test_generate_qa_from_text_mapper.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py
index e1e18bd4f..023394f9b 100644
--- a/tests/ops/mapper/test_generate_qa_from_examples_mapper.py
+++ b/tests/ops/mapper/test_generate_qa_from_examples_mapper.py
@@ -10,7 +10,7 @@
 
 # Skip tests for this OP in the GitHub actions due to ?
 # These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
+@SKIPPED_TESTS.register_module()
 class GenerateQAFromExamplesMapperTest(DataJuicerTestCaseBase):
     text_key = 'text'
 
diff --git a/tests/ops/mapper/test_generate_qa_from_text_mapper.py b/tests/ops/mapper/test_generate_qa_from_text_mapper.py
index 6019d8949..7b3131fd3 100644
--- a/tests/ops/mapper/test_generate_qa_from_text_mapper.py
+++ b/tests/ops/mapper/test_generate_qa_from_text_mapper.py
@@ -11,7 +11,7 @@
 
 # Skip tests for this OP in the GitHub actions due to ?
 # These tests have been tested locally.
-# @SKIPPED_TESTS.register_module()
+@SKIPPED_TESTS.register_module()
 class GenerateQAFromTextMapperTest(DataJuicerTestCaseBase):
     text_key = 'text'
 

From 68741912e7bcff37983b2799c8c46973c458ee46 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Tue, 21 Jan 2025 14:34:12 +0800
Subject: [PATCH 24/26] * open unittests for api-related ops

---
 .github/workflows/unit-test.yml               |  2 +-
 .../test_entity_attribute_aggregator.py       |  3 +-
 .../aggregator/test_meta_tags_aggregator.py   |  4 +--
 .../test_most_relavant_entities_aggregator.py |  4 +--
 .../ops/aggregator/test_nested_aggregator.py  |  4 +--
 tests/ops/mapper/test_calibrate_qa_mapper.py  | 28 +------------------
 .../ops/mapper/test_calibrate_query_mapper.py |  7 +----
 .../mapper/test_calibrate_response_mapper.py  |  6 +---
 .../test_dialog_intent_detection_mapper.py    |  6 +---
 .../test_dialog_sentiment_detection_mapper.py |  6 +---
 .../test_dialog_sentiment_intensity_mapper.py |  7 +----
 .../test_dialog_topic_detection_mapper.py     |  7 +----
 .../test_extract_entity_attribute_mapper.py   |  7 +----
 .../test_extract_entity_relation_mapper.py    |  7 +----
 tests/ops/mapper/test_extract_event_mapper.py |  7 +----
 .../ops/mapper/test_extract_keyword_mapper.py |  7 +----
 .../mapper/test_extract_nickname_mapper.py    |  7 +----
 .../test_extract_support_text_mapper.py       |  7 +----
 .../ops/mapper/test_pair_preference_mapper.py |  6 +---
 .../mapper/test_relation_identity_mapper.py   |  7 +----
 tests/ops/mapper/test_text_chunk_mapper.py    |  3 +-
 21 files changed, 21 insertions(+), 121 deletions(-)

diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
index ce7af4474..9161e3bc7 100644
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -44,7 +44,7 @@ jobs:
     - name: Run unittest standalone
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       run: |
-        docker compose exec ray-head python tests/run.py --tag standalone
+        docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head python tests/run.py --tag standalone
 
     - name: Run unittest ray
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker
diff --git a/tests/ops/aggregator/test_entity_attribute_aggregator.py b/tests/ops/aggregator/test_entity_attribute_aggregator.py
index 1d6a4b1df..68b36b0dc 100644
--- a/tests/ops/aggregator/test_entity_attribute_aggregator.py
+++ b/tests/ops/aggregator/test_entity_attribute_aggregator.py
@@ -4,11 +4,10 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.aggregator import EntityAttributeAggregator
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, BatchMetaKeys, MetaKeys
 
 
-@SKIPPED_TESTS.register_module()
 class EntityAttributeAggregatorTest(DataJuicerTestCaseBase):
 
     def _run_helper(self, op, samples, output_key=BatchMetaKeys.entity_attribute):
diff --git a/tests/ops/aggregator/test_meta_tags_aggregator.py b/tests/ops/aggregator/test_meta_tags_aggregator.py
index 7aba225ae..ecf172012 100644
--- a/tests/ops/aggregator/test_meta_tags_aggregator.py
+++ b/tests/ops/aggregator/test_meta_tags_aggregator.py
@@ -5,10 +5,8 @@
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.aggregator import MetaTagsAggregator
 from data_juicer.utils.constant import Fields, MetaKeys
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
-
-@SKIPPED_TESTS.register_module()
 class MetaTagsAggregatorTest(DataJuicerTestCaseBase):
 
     def _run_helper(self, op, samples):
diff --git a/tests/ops/aggregator/test_most_relavant_entities_aggregator.py b/tests/ops/aggregator/test_most_relavant_entities_aggregator.py
index 5912877ca..21e961cec 100644
--- a/tests/ops/aggregator/test_most_relavant_entities_aggregator.py
+++ b/tests/ops/aggregator/test_most_relavant_entities_aggregator.py
@@ -4,12 +4,10 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.aggregator import MostRelavantEntitiesAggregator
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 from data_juicer.utils.constant import Fields, BatchMetaKeys, MetaKeys
 
-
-@SKIPPED_TESTS.register_module()
 class MostRelavantEntitiesAggregatorTest(DataJuicerTestCaseBase):
 
     def _run_helper(self, op, samples, output_key=BatchMetaKeys.most_relavant_entities):
diff --git a/tests/ops/aggregator/test_nested_aggregator.py b/tests/ops/aggregator/test_nested_aggregator.py
index 697e17e95..87533a687 100644
--- a/tests/ops/aggregator/test_nested_aggregator.py
+++ b/tests/ops/aggregator/test_nested_aggregator.py
@@ -4,12 +4,10 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.aggregator import NestedAggregator
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 from data_juicer.utils.constant import Fields, MetaKeys
 
-
-@SKIPPED_TESTS.register_module()
 class NestedAggregatorTest(DataJuicerTestCaseBase):
 
     def _run_helper(self, op, samples, output_key=MetaKeys.event_description):
diff --git a/tests/ops/mapper/test_calibrate_qa_mapper.py b/tests/ops/mapper/test_calibrate_qa_mapper.py
index 5755ed2b1..9caf57af8 100644
--- a/tests/ops/mapper/test_calibrate_qa_mapper.py
+++ b/tests/ops/mapper/test_calibrate_qa_mapper.py
@@ -1,18 +1,10 @@
-import os
 import unittest
-from unittest.mock import Mock, patch
-
-import httpx
 from loguru import logger
 
 from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# Skip tests for this OP because the API call is not configured yet.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class CalibrateQAMapperTest(DataJuicerTestCaseBase):
 
     def _run_op(self, op):
@@ -81,23 +73,5 @@ def test_args(self):
             response_path='choices.0.message.content')
         self._run_op(op)
 
-    @patch('httpx.Client.send')
-    def test_retry(self, mock_send):
-        mock_response = Mock()
-        mock_response.status_code = 408
-        mock_response.headers = {}
-        mock_request = Mock()
-        mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
-            '408 Client Error: Request Timeout',
-            request=mock_request,
-            response=mock_response)
-        mock_send.return_value = mock_response
-
-        with self.assertLogs(level='DEBUG') as cm:
-            op = CalibrateQAMapper(api_model='test',
-                                   model_params={'max_retries': 3})
-            op.process({'text': '', 'query': '', 'response': ''})
-        self.assertIn('3 retries left', '\n'.join(cm.output))
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/ops/mapper/test_calibrate_query_mapper.py b/tests/ops/mapper/test_calibrate_query_mapper.py
index f95b6c5dc..f0ce5d5d2 100644
--- a/tests/ops/mapper/test_calibrate_query_mapper.py
+++ b/tests/ops/mapper/test_calibrate_query_mapper.py
@@ -3,13 +3,8 @@
 from loguru import logger
 
 from data_juicer.ops.mapper.calibrate_query_mapper import CalibrateQueryMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
-
-# Skip tests for this OP because the API call is not configured yet.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class CalibrateQueryMapperTest(DataJuicerTestCaseBase):
 
     def _run_op(self, api_model, response_path=None):
diff --git a/tests/ops/mapper/test_calibrate_response_mapper.py b/tests/ops/mapper/test_calibrate_response_mapper.py
index 4a9ddbe11..c113709f4 100644
--- a/tests/ops/mapper/test_calibrate_response_mapper.py
+++ b/tests/ops/mapper/test_calibrate_response_mapper.py
@@ -4,13 +4,9 @@
 
 from data_juicer.ops.mapper.calibrate_response_mapper import \
     CalibrateResponseMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# Skip tests for this OP because the API call is not configured yet.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class CalibrateResponseMapperTest(DataJuicerTestCaseBase):
 
     def _run_op(self, api_model, response_path=None):
diff --git a/tests/ops/mapper/test_dialog_intent_detection_mapper.py b/tests/ops/mapper/test_dialog_intent_detection_mapper.py
index d2a44ab65..b7765fd19 100644
--- a/tests/ops/mapper/test_dialog_intent_detection_mapper.py
+++ b/tests/ops/mapper/test_dialog_intent_detection_mapper.py
@@ -4,13 +4,9 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.dialog_intent_detection_mapper import DialogIntentDetectionMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class TestDialogIntentDetectionMapper(DataJuicerTestCaseBase):
     # before runing this test, set below environment variables:
     # export OPENAI_API_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
diff --git a/tests/ops/mapper/test_dialog_sentiment_detection_mapper.py b/tests/ops/mapper/test_dialog_sentiment_detection_mapper.py
index 5f0763149..a118d096d 100644
--- a/tests/ops/mapper/test_dialog_sentiment_detection_mapper.py
+++ b/tests/ops/mapper/test_dialog_sentiment_detection_mapper.py
@@ -5,13 +5,9 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.dialog_sentiment_detection_mapper import DialogSentimentDetectionMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class TestDialogSentimentDetectionMapper(DataJuicerTestCaseBase):
     # before runing this test, set below environment variables:
     # export OPENAI_API_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
diff --git a/tests/ops/mapper/test_dialog_sentiment_intensity_mapper.py b/tests/ops/mapper/test_dialog_sentiment_intensity_mapper.py
index 93fdc54f6..f5df993ae 100644
--- a/tests/ops/mapper/test_dialog_sentiment_intensity_mapper.py
+++ b/tests/ops/mapper/test_dialog_sentiment_intensity_mapper.py
@@ -1,17 +1,12 @@
 import unittest
-import json
 
 from loguru import logger
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.dialog_sentiment_intensity_mapper import DialogSentimentIntensityMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class TestDialogSentimentIntensityMapper(DataJuicerTestCaseBase):
     # before runing this test, set below environment variables:
     # export OPENAI_API_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
diff --git a/tests/ops/mapper/test_dialog_topic_detection_mapper.py b/tests/ops/mapper/test_dialog_topic_detection_mapper.py
index d6d1f5e3d..0cceea32d 100644
--- a/tests/ops/mapper/test_dialog_topic_detection_mapper.py
+++ b/tests/ops/mapper/test_dialog_topic_detection_mapper.py
@@ -1,17 +1,12 @@
 import unittest
-import json
 
 from loguru import logger
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.dialog_topic_detection_mapper import DialogTopicDetectionMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class TestDialogTopicDetectionMapper(DataJuicerTestCaseBase):
     # before runing this test, set below environment variables:
     # export OPENAI_API_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
diff --git a/tests/ops/mapper/test_extract_entity_attribute_mapper.py b/tests/ops/mapper/test_extract_entity_attribute_mapper.py
index 0ef2579e2..18811cde5 100644
--- a/tests/ops/mapper/test_extract_entity_attribute_mapper.py
+++ b/tests/ops/mapper/test_extract_entity_attribute_mapper.py
@@ -1,17 +1,12 @@
 import unittest
-import json
 
 from loguru import logger
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.extract_entity_attribute_mapper import ExtractEntityAttributeMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class ExtractEntityAttributeMapperTest(DataJuicerTestCaseBase):
 
 
diff --git a/tests/ops/mapper/test_extract_entity_relation_mapper.py b/tests/ops/mapper/test_extract_entity_relation_mapper.py
index 053881e24..43b5ac809 100644
--- a/tests/ops/mapper/test_extract_entity_relation_mapper.py
+++ b/tests/ops/mapper/test_extract_entity_relation_mapper.py
@@ -1,17 +1,12 @@
 import unittest
-import json
 
 from loguru import logger
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.extract_entity_relation_mapper import ExtractEntityRelationMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class ExtractEntityRelationMapperTest(DataJuicerTestCaseBase):
 
 
diff --git a/tests/ops/mapper/test_extract_event_mapper.py b/tests/ops/mapper/test_extract_event_mapper.py
index 8da2caf8a..58440a434 100644
--- a/tests/ops/mapper/test_extract_event_mapper.py
+++ b/tests/ops/mapper/test_extract_event_mapper.py
@@ -1,17 +1,12 @@
 import unittest
-import json
 
 from loguru import logger
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.extract_event_mapper import ExtractEventMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class ExtractEventMapperTest(DataJuicerTestCaseBase):
 
 
diff --git a/tests/ops/mapper/test_extract_keyword_mapper.py b/tests/ops/mapper/test_extract_keyword_mapper.py
index 47b30d687..54e29d74a 100644
--- a/tests/ops/mapper/test_extract_keyword_mapper.py
+++ b/tests/ops/mapper/test_extract_keyword_mapper.py
@@ -1,17 +1,12 @@
 import unittest
-import json
 
 from loguru import logger
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.extract_keyword_mapper import ExtractKeywordMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class ExtractKeywordMapperTest(DataJuicerTestCaseBase):
 
 
diff --git a/tests/ops/mapper/test_extract_nickname_mapper.py b/tests/ops/mapper/test_extract_nickname_mapper.py
index df204c13e..4e66056cc 100644
--- a/tests/ops/mapper/test_extract_nickname_mapper.py
+++ b/tests/ops/mapper/test_extract_nickname_mapper.py
@@ -1,17 +1,12 @@
 import unittest
-import json
 
 from loguru import logger
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.extract_nickname_mapper import ExtractNicknameMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class ExtractNicknameMapperTest(DataJuicerTestCaseBase):
 
 
diff --git a/tests/ops/mapper/test_extract_support_text_mapper.py b/tests/ops/mapper/test_extract_support_text_mapper.py
index 4ee2652c2..8758608a7 100644
--- a/tests/ops/mapper/test_extract_support_text_mapper.py
+++ b/tests/ops/mapper/test_extract_support_text_mapper.py
@@ -1,17 +1,12 @@
 import unittest
-import json
 
 from loguru import logger
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.extract_support_text_mapper import ExtractSupportTextMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class ExtractSupportTextMapperTest(DataJuicerTestCaseBase):
 
 
diff --git a/tests/ops/mapper/test_pair_preference_mapper.py b/tests/ops/mapper/test_pair_preference_mapper.py
index 93cd4d877..9c3b213e7 100644
--- a/tests/ops/mapper/test_pair_preference_mapper.py
+++ b/tests/ops/mapper/test_pair_preference_mapper.py
@@ -3,13 +3,9 @@
 from loguru import logger
 
 from data_juicer.ops.mapper.pair_preference_mapper import PairPreferenceMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# Skip tests for this OP because the API call is not configured yet.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class PairPreferenceMapperTest(DataJuicerTestCaseBase):
 
     def _run_op(self, op, samples):
diff --git a/tests/ops/mapper/test_relation_identity_mapper.py b/tests/ops/mapper/test_relation_identity_mapper.py
index 57382f988..674fb11be 100644
--- a/tests/ops/mapper/test_relation_identity_mapper.py
+++ b/tests/ops/mapper/test_relation_identity_mapper.py
@@ -1,17 +1,12 @@
 import unittest
-import json
 
 from loguru import logger
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.relation_identity_mapper import RelationIdentityMapper
-from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
-                                              DataJuicerTestCaseBase)
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 from data_juicer.utils.constant import Fields, MetaKeys
 
-# Skip tests for this OP.
-# These tests have been tested locally.
-@SKIPPED_TESTS.register_module()
 class RelationIdentityMapperTest(DataJuicerTestCaseBase):
 
     # before runing this test, set below environment variables:
diff --git a/tests/ops/mapper/test_text_chunk_mapper.py b/tests/ops/mapper/test_text_chunk_mapper.py
index a412104c4..94ee24caf 100644
--- a/tests/ops/mapper/test_text_chunk_mapper.py
+++ b/tests/ops/mapper/test_text_chunk_mapper.py
@@ -2,10 +2,9 @@
 
 from data_juicer.core.data import NestedDataset as Dataset
 from data_juicer.ops.mapper.text_chunk_mapper import TextChunkMapper
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-@SKIPPED_TESTS.register_module()
 class TextChunkMapperTest(DataJuicerTestCaseBase):
 
     def _run_helper(self, op, samples, target):

From f52106dc7001bbae38342e5ad80938e276f00923 Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Tue, 21 Jan 2025 16:24:16 +0800
Subject: [PATCH 25/26] * fix wrong attr name

---
 tests/ops/aggregator/test_entity_attribute_aggregator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ops/aggregator/test_entity_attribute_aggregator.py b/tests/ops/aggregator/test_entity_attribute_aggregator.py
index 68b36b0dc..b1599ca8c 100644
--- a/tests/ops/aggregator/test_entity_attribute_aggregator.py
+++ b/tests/ops/aggregator/test_entity_attribute_aggregator.py
@@ -23,7 +23,7 @@ def _run_helper(self, op, samples, output_key=BatchMetaKeys.entity_attribute):
             for k in data:
                 logger.info(f"{k}: {data[k]}")
             self.assertIn(output_key, data[Fields.batch_meta])
-            self.assertNotEqual(data[Fields.batch_met][output_key], '')
+            self.assertNotEqual(data[Fields.batch_meta][output_key], '')
 
         self.assertEqual(len(new_dataset), len(samples))
 

From d2b00646269ed84a7f677b2ad76ee9b9066894fc Mon Sep 17 00:00:00 2001
From: "lielin.hyl" <lielin.hyl@alibaba-inc.com>
Date: Tue, 21 Jan 2025 22:11:37 +0800
Subject: [PATCH 26/26] * increase shm_size to avoid OOM

---
 .github/workflows/docker/docker-compose.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker/docker-compose.yml b/.github/workflows/docker/docker-compose.yml
index 61c2c84a5..2f5f07a48 100644
--- a/.github/workflows/docker/docker-compose.yml
+++ b/.github/workflows/docker/docker-compose.yml
@@ -21,7 +21,7 @@ services:
     ports:
       - "6379:6379"
       - "8265:8265"
-    shm_size: "64G"
+    shm_size: "128G"
     deploy:
       resources:
         reservations:
@@ -49,7 +49,7 @@ services:
       - ray-head
     networks:
       - ray-network
-    shm_size: "64G"
+    shm_size: "128G"
     deploy:
       resources:
         reservations: