Merge branch 'modelscope:main' into segment_mapper

Qirui-jiao authored Nov 29, 2024
2 parents f615869 + 6766316 commit 121f4d3
Showing 22 changed files with 267 additions and 60 deletions.
10 changes: 3 additions & 7 deletions Dockerfile
@@ -14,7 +14,7 @@ RUN apt-get update \

 # install 3rd-party system dependencies
 RUN apt-get update \
-    && apt-get install ffmpeg libsm6 libxext6 software-properties-common build-essential cmake -y
+    && apt-get install ffmpeg libsm6 libxext6 software-properties-common build-essential cmake gfortran libopenblas-dev liblapack-dev -y

 # prepare the java env
 WORKDIR /opt
@@ -33,11 +33,7 @@ WORKDIR /data-juicer
 RUN pip install --upgrade setuptools==69.5.1 setuptools_scm \
     && pip install git+https://github.com/xinyu1205/recognize-anything.git --default-timeout 1000

-# install requirements first to better reuse installed library cache
-COPY environments/ environments/
-RUN cat environments/* | grep -v '^#' | xargs pip install --default-timeout 1000
-
 # install data-juicer then
 COPY . .
-RUN pip install -v -e .[all]
-RUN pip install -v -e .[sandbox]
+RUN pip install -v -e .[all] --default-timeout 1000
+RUN pip install -v -e .[sandbox] --default-timeout 1000
10 changes: 9 additions & 1 deletion README.md
@@ -163,7 +163,7 @@ Table of Contents

 ## Prerequisites

-- Recommend Python>=3.8,<=3.10
+- Recommend Python>=3.9,<=3.10
 - gcc >= 5 (at least C++14 support)

 ## Installation
@@ -386,6 +386,10 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml
 ```shell
 # run the data processing directly
 docker run --rm \ # remove container after the processing
+  --privileged \
+  --shm-size 256g \
+  --network host \
+  --gpus all \
   --name dj \ # name of the container
   -v <host_data_path>:<image_data_path> \ # mount data or config directory into the container
   -v ~/.cache/:/root/.cache/ \ # mount the cache directory into the container to reuse caches and models (recommended)
@@ -398,6 +402,10 @@ docker run --rm \ # remove container after the processing
 ```shell
 # start the container
 docker run -dit \ # run the container in the background
+  --privileged \
+  --shm-size 256g \
+  --network host \
+  --gpus all \
   --rm \
   --name dj \
   -v <host_data_path>:<image_data_path> \
10 changes: 9 additions & 1 deletion README_ZH.md
@@ -144,7 +144,7 @@ Data-Juicer is being actively updated and maintained; we will periodically enhance it and add more

 ## Prerequisites

-* Recommended Python>=3.8,<=3.10
+* Recommended Python>=3.9,<=3.10
 * gcc >= 5 (at least C++14 support)

 ## Installation
@@ -363,6 +363,10 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml
 ```shell
 # run the data processing directly
 docker run --rm \ # remove the container after processing
+  --privileged \
+  --shm-size 256g \
+  --network host \
+  --gpus all \
   --name dj \ # container name
   -v <host_data_path>:<image_data_path> \ # mount local data or config directory into the container
   -v ~/.cache/:/root/.cache/ \ # mount the cache directory into the container to reuse caches and models (recommended)
@@ -375,6 +379,10 @@ docker run --rm \ # remove the container after processing
 ```shell
 # start the container
 docker run -dit \ # run the container in the background
+  --privileged \
+  --shm-size 256g \
+  --network host \
+  --gpus all \
   --rm \
   --name dj \
   -v <host_data_path>:<image_data_path> \
12 changes: 12 additions & 0 deletions configs/config_all.yaml
@@ -248,6 +248,18 @@ process:
       sampling_params: {}  # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
   - optimize_query_mapper:  # optimize query in question-answer pairs.
   - optimize_response_mapper:  # optimize response in question-answer pairs.
+  - pair_preference_mapper:  # construct paired preference samples.
+      api_model: 'gpt-4o'  # API model name.
+      api_endpoint: null  # URL endpoint for the API.
+      response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt: null  # System prompt for guiding the generation task.
+      input_template: null  # Template for building the model input.
+      output_pattern: null  # Regular expression for parsing model output.
+      rejected_key: 'rejected_response'  # The field name in the sample to store the generated rejected response.
+      reason_key: 'reason'  # The field name in the sample to store the reason for generating the response.
+      try_num: 3  # The number of retries for the API call in case of response parsing failure.
+      model_params: {}  # Parameters for initializing the API model.
+      sampling_params: {}  # Extra parameters passed to the API call.
   - punctuation_normalization_mapper:  # normalize unicode punctuations to English punctuations.
   - remove_bibliography_mapper:  # remove bibliography from Latex text.
   - remove_comments_mapper:  # remove comments from Latex text, code, etc.
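For orientation, the sketch below shows how the new op might be driven from Python rather than from a YAML recipe. It is a minimal, hypothetical usage example: it assumes the constructor accepts keyword arguments matching the config keys above and that, like the other API-based mappers touched in this commit, the op exposes `process_single(sample)`. The import is backed by the `__init__.py` diff below; the sample fields and values are illustrative only, not taken from the repository.

```python
from data_juicer.ops.mapper import PairPreferenceMapper

# Hypothetical instantiation mirroring the YAML entry above.
op = PairPreferenceMapper(
    api_model='gpt-4o',                    # API model name
    rejected_key='rejected_response',      # where the rejected response is stored
    reason_key='reason',                   # where the generation reason is stored
    try_num=3,                             # retries on response-parsing failure
    sampling_params={'temperature': 0.9},  # extra parameters passed to the API call
)

# Illustrative input sample; the exact fields the op reads are an assumption.
sample = {
    'text': 'Reference text used to ground the preference pair.',
    'query': 'What does Data-Juicer provide?',
    'response': 'A one-stop system for processing multimodal data.',
}

result = op.process_single(sample)
print(result.get('rejected_response'), result.get('reason'))
```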
28 changes: 15 additions & 13 deletions data_juicer/ops/mapper/__init__.py
@@ -29,6 +29,7 @@
 from .optimize_qa_mapper import OptimizeQAMapper
 from .optimize_query_mapper import OptimizeQueryMapper
 from .optimize_response_mapper import OptimizeResponseMapper
+from .pair_preference_mapper import PairPreferenceMapper
 from .punctuation_normalization_mapper import PunctuationNormalizationMapper
 from .remove_bibliography_mapper import RemoveBibliographyMapper
 from .remove_comments_mapper import RemoveCommentsMapper
@@ -74,17 +75,18 @@
     'ImageCaptioningMapper', 'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageSegmentMapper',
     'ImageTaggingMapper', 'NlpaugEnMapper', 'NlpcdaZhMapper',
     'OptimizeQAMapper', 'OptimizeQueryMapper', 'OptimizeResponseMapper',
-    'PunctuationNormalizationMapper', 'RemoveBibliographyMapper',
-    'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper',
-    'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
-    'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
-    'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
-    'SentenceSplitMapper', 'TextChunkMapper', 'VideoCaptioningFromAudioMapper',
-    'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
-    'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
-    'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',
-    'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper',
-    'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper',
-    'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper',
-    'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper'
+    'PairPreferenceMapper', 'PunctuationNormalizationMapper',
+    'RemoveBibliographyMapper', 'RemoveCommentsMapper', 'RemoveHeaderMapper',
+    'RemoveLongWordsMapper', 'RemoveNonChineseCharacterlMapper',
+    'RemoveRepeatSentencesMapper', 'RemoveSpecificCharsMapper',
+    'RemoveTableTextMapper', 'RemoveWordsWithIncorrectSubstringsMapper',
+    'ReplaceContentMapper', 'SentenceSplitMapper', 'TextChunkMapper',
+    'VideoCaptioningFromAudioMapper', 'VideoCaptioningFromFramesMapper',
+    'VideoCaptioningFromSummarizerMapper', 'VideoCaptioningFromVideoMapper',
+    'VideoFFmpegWrappedMapper', 'VideoFaceBlurMapper',
+    'VideoRemoveWatermarkMapper', 'VideoResizeAspectRatioMapper',
+    'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper',
+    'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper',
+    'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper',
+    'WhitespaceNormalizationMapper'
 ]
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/calibrate_qa_mapper.py
@@ -4,14 +4,13 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.model_utils import get_model, prepare_model

 OP_NAME = 'calibrate_qa_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class CalibrateQAMapper(Mapper):
     """
@@ -107,7 +106,7 @@ def process_single(self, sample, rank=None):
             'content': self.build_input(sample)
         }]
         parsed_q, parsed_a = None, None
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 output = client(messages, **self.sampling_params)
                 parsed_q, parsed_a = self.parse_output(output)
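The loop renamed here (`for i` → `for _`) is the retry-on-parse-failure pattern shared by the LLM-based mappers in this commit; the new name only signals that the index is unused. The diff truncates the `except` branch, so the following is a standalone sketch of that pattern under the assumption that failures are logged and the loop stops once a parsable answer is obtained; it is not the repository's exact code.

```python
from typing import Callable, Optional, Tuple

from loguru import logger


def call_with_retries(
    client: Callable[..., str],
    messages: list,
    parse_output: Callable[[str], Tuple[Optional[str], Optional[str]]],
    try_num: int = 3,
    **sampling_params,
) -> Tuple[Optional[str], Optional[str]]:
    """Query an API client, retrying whenever the reply cannot be parsed."""
    parsed_q, parsed_a = None, None
    for _ in range(try_num):  # the loop index itself is never used
        try:
            output = client(messages, **sampling_params)
            parsed_q, parsed_a = parse_output(output)
            if parsed_q is not None and parsed_a is not None:
                break  # stop as soon as a usable answer arrives
        except Exception as e:  # assumed behavior: log the failure and retry
            logger.warning(f'Exception: {e}')
    return parsed_q, parsed_a
```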
3 changes: 1 addition & 2 deletions data_juicer/ops/mapper/calibrate_query_mapper.py
@@ -1,11 +1,10 @@
-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
+from data_juicer.ops.base_op import OPERATORS
 from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper

 OP_NAME = 'calibrate_query_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class CalibrateQueryMapper(CalibrateQAMapper):
     """
3 changes: 1 addition & 2 deletions data_juicer/ops/mapper/calibrate_response_mapper.py
@@ -1,11 +1,10 @@
-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
+from data_juicer.ops.base_op import OPERATORS
 from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper

 OP_NAME = 'calibrate_response_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class CalibrateResponseMapper(CalibrateQAMapper):
     """
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_entity_attribute_mapper.py
@@ -5,15 +5,14 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model

 OP_NAME = 'extract_entity_attribute_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractEntityAttributeMapper(Mapper):
     """
@@ -154,7 +153,7 @@ def _process_single_sample(self, text='', rank=None):
         }]

         desc, demos = '', []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 output = client(messages, **self.sampling_params)
                 desc, demos = self.parse_output(output, attribute)
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_entity_relation_mapper.py
@@ -9,7 +9,7 @@
 from loguru import logger
 from pydantic import NonNegativeInt, PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.common_utils import is_float
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model
@@ -20,7 +20,6 @@


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractEntityRelationMapper(Mapper):
     """
@@ -319,7 +318,7 @@ def process_single(self, sample, rank=None):
         messages = [{'role': 'user', 'content': input_prompt}]

         entities, relations = [], []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 result = self.light_rag_extraction(messages, rank=rank)
                 entities, relations = self.parse_output(result)
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_event_mapper.py
@@ -5,7 +5,7 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model

@@ -15,7 +15,6 @@


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractEventMapper(Mapper):
     """
@@ -134,7 +133,7 @@ def _process_single_sample(self, text='', rank=None):
         }]

         event_list, character_list = [], []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 output = client(messages, **self.sampling_params)
                 event_list, character_list = self.parse_output(output)
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_keyword_mapper.py
@@ -6,7 +6,7 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model

@@ -16,7 +16,6 @@


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractKeywordMapper(Mapper):
     """
@@ -173,7 +172,7 @@ def process_single(self, sample, rank=None):
         messages = [{'role': 'user', 'content': input_prompt}]

         keywords = []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 result = client(messages, **self.sampling_params)
                 keywords = self.parse_output(result)
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_nickname_mapper.py
@@ -4,15 +4,14 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model

 OP_NAME = 'extract_nickname_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractNicknameMapper(Mapper):
     """
@@ -143,7 +142,7 @@ def process_single(self, sample, rank=None):
             'content': input_prompt
         }]
         nickname_relations = []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 output = client(messages, **self.sampling_params)
                 nickname_relations = self.parse_output(output)
(The diffs for the remaining changed files are not shown in this view.)