Merge branch 'modelscope:main' into segment_mapper

Qirui-jiao authored Nov 29, 2024
2 parents f615869 + 6766316 commit 121f4d3
Showing 22 changed files with 267 additions and 60 deletions.
10 changes: 3 additions & 7 deletions Dockerfile
@@ -14,7 +14,7 @@ RUN apt-get update \

 # install 3rd-party system dependencies
 RUN apt-get update \
-    && apt-get install ffmpeg libsm6 libxext6 software-properties-common build-essential cmake -y
+    && apt-get install ffmpeg libsm6 libxext6 software-properties-common build-essential cmake gfortran libopenblas-dev liblapack-dev -y

 # prepare the java env
 WORKDIR /opt
@@ -33,11 +33,7 @@ WORKDIR /data-juicer
 RUN pip install --upgrade setuptools==69.5.1 setuptools_scm \
     && pip install git+https://github.com/xinyu1205/recognize-anything.git --default-timeout 1000

-# install requirements first to better reuse installed library cache
-COPY environments/ environments/
-RUN cat environments/* | grep -v '^#' | xargs pip install --default-timeout 1000
-
 # install data-juicer then
 COPY . .
-RUN pip install -v -e .[all]
-RUN pip install -v -e .[sandbox]
+RUN pip install -v -e .[all] --default-timeout 1000
+RUN pip install -v -e .[sandbox] --default-timeout 1000
10 changes: 9 additions & 1 deletion README.md
@@ -163,7 +163,7 @@ Table of Contents

 ## Prerequisites

-- Recommend Python>=3.8,<=3.10
+- Recommend Python>=3.9,<=3.10
 - gcc >= 5 (at least C++14 support)

 ## Installation
@@ -386,6 +386,10 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml
 ```shell
 # run the data processing directly
 docker run --rm \ # remove container after the processing
+  --privileged \
+  --shm-size 256g \
+  --network host \
+  --gpus all \
   --name dj \ # name of the container
   -v <host_data_path>:<image_data_path> \ # mount data or config directory into the container
   -v ~/.cache/:/root/.cache/ \ # mount the cache directory into the container to reuse caches and models (recommended)
@@ -398,6 +402,10 @@ docker run --rm \ # remove container after the processing
 ```shell
 # start the container
 docker run -dit \ # run the container in the background
+  --privileged \
+  --shm-size 256g \
+  --network host \
+  --gpus all \
   --rm \
   --name dj \
   -v <host_data_path>:<image_data_path> \
10 changes: 9 additions & 1 deletion README_ZH.md
@@ -144,7 +144,7 @@ Data-Juicer is being actively updated and maintained; we will periodically enhance it and add more

 ## Prerequisites

-* Recommended Python>=3.8,<=3.10
+* Recommended Python>=3.9,<=3.10
 * gcc >= 5 (at least C++14 support)

 ## Installation
@@ -363,6 +363,10 @@ python tools/sandbox_starter.py --config configs/demo/sandbox/sandbox.yaml
 ```shell
 # run the data processing directly
 docker run --rm \ # remove the container after processing
+  --privileged \
+  --shm-size 256g \
+  --network host \
+  --gpus all \
   --name dj \ # container name
   -v <host_data_path>:<image_data_path> \ # mount local data or config directory into the container
   -v ~/.cache/:/root/.cache/ \ # mount the cache directory into the container to reuse caches and models (recommended)
@@ -375,6 +379,10 @@ docker run --rm \ # remove the container after processing
 ```shell
 # start the container
 docker run -dit \ # run the container in the background
+  --privileged \
+  --shm-size 256g \
+  --network host \
+  --gpus all \
   --rm \
   --name dj \
   -v <host_data_path>:<image_data_path> \
12 changes: 12 additions & 0 deletions configs/config_all.yaml
@@ -248,6 +248,18 @@ process:
       sampling_params: {}  # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
   - optimize_query_mapper:  # optimize query in question-answer pairs.
   - optimize_response_mapper:  # optimize response in question-answer pairs.
+  - pair_preference_mapper:  # construct paired preference samples.
+      api_model: 'gpt-4o'  # API model name.
+      api_endpoint: null  # URL endpoint for the API.
+      response_path: null  # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt: null  # System prompt for guiding the generation task.
+      input_template: null  # Template for building the model input.
+      output_pattern: null  # Regular expression for parsing model output.
+      rejected_key: 'rejected_response'  # The field name in the sample to store the generated rejected response.
+      reason_key: 'reason'  # The field name in the sample to store the reason for generating the response.
+      try_num: 3  # The number of retries for the API call in case of response parsing failure.
+      model_params: {}  # Parameters for initializing the API model.
+      sampling_params: {}  # Extra parameters passed to the API call.
   - punctuation_normalization_mapper:  # normalize unicode punctuations to English punctuations.
   - remove_bibliography_mapper:  # remove bibliography from Latex text.
   - remove_comments_mapper:  # remove comments from Latex text, code, etc.
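For orientation, the sketch below shows how the new op might be driven from Python rather than from a YAML recipe. It is a minimal, hypothetical usage example: it assumes the constructor accepts keyword arguments matching the config keys above and that, like the other API-based mappers touched in this commit, the op exposes `process_single(sample)`. The import is backed by the `__init__.py` diff below; the sample fields and values are illustrative only, not taken from the repository.

```python
from data_juicer.ops.mapper import PairPreferenceMapper

# Hypothetical instantiation mirroring the YAML entry above.
op = PairPreferenceMapper(
    api_model='gpt-4o',                    # API model name
    rejected_key='rejected_response',      # where the rejected response is stored
    reason_key='reason',                   # where the generation reason is stored
    try_num=3,                             # retries on response-parsing failure
    sampling_params={'temperature': 0.9},  # extra parameters passed to the API call
)

# Illustrative input sample; the exact fields the op reads are an assumption.
sample = {
    'text': 'Reference text used to ground the preference pair.',
    'query': 'What does Data-Juicer provide?',
    'response': 'A one-stop system for processing multimodal data.',
}

result = op.process_single(sample)
print(result.get('rejected_response'), result.get('reason'))
```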
28 changes: 15 additions & 13 deletions data_juicer/ops/mapper/__init__.py
@@ -29,6 +29,7 @@
 from .optimize_qa_mapper import OptimizeQAMapper
 from .optimize_query_mapper import OptimizeQueryMapper
 from .optimize_response_mapper import OptimizeResponseMapper
+from .pair_preference_mapper import PairPreferenceMapper
 from .punctuation_normalization_mapper import PunctuationNormalizationMapper
 from .remove_bibliography_mapper import RemoveBibliographyMapper
 from .remove_comments_mapper import RemoveCommentsMapper
@@ -74,17 +75,18 @@
     'ImageCaptioningMapper', 'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageSegmentMapper',
     'ImageTaggingMapper', 'NlpaugEnMapper', 'NlpcdaZhMapper',
     'OptimizeQAMapper', 'OptimizeQueryMapper', 'OptimizeResponseMapper',
-    'PunctuationNormalizationMapper', 'RemoveBibliographyMapper',
-    'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper',
-    'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
-    'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
-    'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
-    'SentenceSplitMapper', 'TextChunkMapper', 'VideoCaptioningFromAudioMapper',
-    'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
-    'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
-    'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',
-    'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper',
-    'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper',
-    'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper',
-    'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper'
+    'PairPreferenceMapper', 'PunctuationNormalizationMapper',
+    'RemoveBibliographyMapper', 'RemoveCommentsMapper', 'RemoveHeaderMapper',
+    'RemoveLongWordsMapper', 'RemoveNonChineseCharacterlMapper',
+    'RemoveRepeatSentencesMapper', 'RemoveSpecificCharsMapper',
+    'RemoveTableTextMapper', 'RemoveWordsWithIncorrectSubstringsMapper',
+    'ReplaceContentMapper', 'SentenceSplitMapper', 'TextChunkMapper',
+    'VideoCaptioningFromAudioMapper', 'VideoCaptioningFromFramesMapper',
+    'VideoCaptioningFromSummarizerMapper', 'VideoCaptioningFromVideoMapper',
+    'VideoFFmpegWrappedMapper', 'VideoFaceBlurMapper',
+    'VideoRemoveWatermarkMapper', 'VideoResizeAspectRatioMapper',
+    'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper',
+    'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper',
+    'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper',
+    'WhitespaceNormalizationMapper'
 ]
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/calibrate_qa_mapper.py
@@ -4,14 +4,13 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.model_utils import get_model, prepare_model

 OP_NAME = 'calibrate_qa_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class CalibrateQAMapper(Mapper):
     """
@@ -107,7 +106,7 @@ def process_single(self, sample, rank=None):
             'content': self.build_input(sample)
         }]
         parsed_q, parsed_a = None, None
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 output = client(messages, **self.sampling_params)
                 parsed_q, parsed_a = self.parse_output(output)
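The loop renamed here (`for i` → `for _`) is the retry-on-parse-failure pattern shared by the LLM-based mappers in this commit; the new name only signals that the index is unused. The diff truncates the `except` branch, so the following is a standalone sketch of that pattern under the assumption that failures are logged and the loop stops once a parsable answer is obtained; it is not the repository's exact code.

```python
from typing import Callable, Optional, Tuple

from loguru import logger


def call_with_retries(
    client: Callable[..., str],
    messages: list,
    parse_output: Callable[[str], Tuple[Optional[str], Optional[str]]],
    try_num: int = 3,
    **sampling_params,
) -> Tuple[Optional[str], Optional[str]]:
    """Query an API client, retrying whenever the reply cannot be parsed."""
    parsed_q, parsed_a = None, None
    for _ in range(try_num):  # the loop index itself is never used
        try:
            output = client(messages, **sampling_params)
            parsed_q, parsed_a = parse_output(output)
            if parsed_q is not None and parsed_a is not None:
                break  # stop as soon as a usable answer arrives
        except Exception as e:  # assumed behavior: log the failure and retry
            logger.warning(f'Exception: {e}')
    return parsed_q, parsed_a
```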
3 changes: 1 addition & 2 deletions data_juicer/ops/mapper/calibrate_query_mapper.py
@@ -1,11 +1,10 @@
-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
+from data_juicer.ops.base_op import OPERATORS
 from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper

 OP_NAME = 'calibrate_query_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class CalibrateQueryMapper(CalibrateQAMapper):
     """
3 changes: 1 addition & 2 deletions data_juicer/ops/mapper/calibrate_response_mapper.py
@@ -1,11 +1,10 @@
-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
+from data_juicer.ops.base_op import OPERATORS
 from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper

 OP_NAME = 'calibrate_response_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class CalibrateResponseMapper(CalibrateQAMapper):
     """
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_entity_attribute_mapper.py
@@ -5,15 +5,14 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model

 OP_NAME = 'extract_entity_attribute_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractEntityAttributeMapper(Mapper):
     """
@@ -154,7 +153,7 @@ def _process_single_sample(self, text='', rank=None):
         }]

         desc, demos = '', []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 output = client(messages, **self.sampling_params)
                 desc, demos = self.parse_output(output, attribute)
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_entity_relation_mapper.py
@@ -9,7 +9,7 @@
 from loguru import logger
 from pydantic import NonNegativeInt, PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.common_utils import is_float
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model
@@ -20,7 +20,6 @@


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractEntityRelationMapper(Mapper):
     """
@@ -319,7 +318,7 @@ def process_single(self, sample, rank=None):
         messages = [{'role': 'user', 'content': input_prompt}]

         entities, relations = [], []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 result = self.light_rag_extraction(messages, rank=rank)
                 entities, relations = self.parse_output(result)
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_event_mapper.py
@@ -5,7 +5,7 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model

@@ -15,7 +15,6 @@


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractEventMapper(Mapper):
     """
@@ -134,7 +133,7 @@ def _process_single_sample(self, text='', rank=None):
         }]

         event_list, character_list = [], []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 output = client(messages, **self.sampling_params)
                 event_list, character_list = self.parse_output(output)
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_keyword_mapper.py
@@ -6,7 +6,7 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model

@@ -16,7 +16,6 @@


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractKeywordMapper(Mapper):
     """
@@ -173,7 +172,7 @@ def process_single(self, sample, rank=None):
         messages = [{'role': 'user', 'content': input_prompt}]

         keywords = []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 result = client(messages, **self.sampling_params)
                 keywords = self.parse_output(result)
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/extract_nickname_mapper.py
@@ -4,15 +4,14 @@
 from loguru import logger
 from pydantic import PositiveInt

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
+from data_juicer.ops.base_op import OPERATORS, Mapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.model_utils import get_model, prepare_model

 OP_NAME = 'extract_nickname_mapper'


 # TODO: LLM-based inference.
-@UNFORKABLE.register_module(OP_NAME)
 @OPERATORS.register_module(OP_NAME)
 class ExtractNicknameMapper(Mapper):
     """
@@ -143,7 +142,7 @@ def process_single(self, sample, rank=None):
             'content': input_prompt
         }]
         nickname_relations = []
-        for i in range(self.try_num):
+        for _ in range(self.try_num):
             try:
                 output = client(messages, **self.sampling_params)
                 nickname_relations = self.parse_output(output)
(The diffs for the remaining changed files are not shown in this view.)