Merge branch 'main' into feat/insight_mining

# Conflicts: # data_juicer/ops/__init__.py # data_juicer/ops/base_op.py
modelscope · Dec 19, 2024 · 3ca9994 · 3ca9994
2 parents e3d7b8b + b4811a0
commit 3ca9994
Show file tree

Hide file tree

Showing 54 changed files with 2,632 additions and 350 deletions.
diff --git a/README.md b/README.md
@@ -197,6 +197,22 @@ The dependency options are listed below:
 | `.[tools]`       | Install dependencies for dedicated tools, such as quality classifiers.                       |
 | `.[sandbox]`     | Install all dependencies for sandbox.                                                        |
 
+- Install dependencies for specific OPs
+
+As the number of OPs grows, the dependencies of all OPs become very heavy. Instead of using the command `pip install -v -e .[sci]` to install all dependencies,
+we provide two alternative, lighter options:
+
+  - Automatic Minimal Dependency Installation: During the execution of Data-Juicer, minimal dependencies will be automatically installed. This allows for immediate execution, but may potentially lead to dependency conflicts.
+
+  - Manual Minimal Dependency Installation: To manually install minimal dependencies tailored to a specific execution configuration, run the following command:
+    ```shell
+    # only for installation from source
+    python tools/dj_install.py --config path_to_your_data-juicer_config_file
+
+    # use command line tool
+    dj-install --config path_to_your_data-juicer_config_file
+    ```
+
 ### Using pip
 
 - Run the following command to install the latest released `data_juicer` using `pip`:

diff --git a/README_ZH.md b/README_ZH.md
@@ -178,6 +178,21 @@ pip install -v -e .[tools] # 安装部分工具库的依赖
 | `.[tools]`       | 安装专用工具库（如质量分类器）所需的依赖项        |
 | `.[sandbox]`     | 安装沙盒实验室的基础依赖                 |
 
+* 只安装部分算子依赖
+
+随着OP数量的增长，所有OP的依赖变得很重。为此，我们提供了两个替代的、更轻量的选项，作为使用命令`pip install -v -e .[sci]`安装所有依赖的替代：
+
+  * 自动最小依赖安装：在执行Data-Juicer的过程中，将自动安装最小依赖。也就是说你可以直接执行，但这种方式可能会导致一些依赖冲突。
+
+  * 手动最小依赖安装：可以通过如下指令手动安装适合特定执行配置的最小依赖：
+    ```shell
+    # 适用于从源码安装
+    python tools/dj_install.py --config path_to_your_data-juicer_config_file
+
+    # 使用命令行工具
+    dj-install --config path_to_your_data-juicer_config_file
+    ```
+
 ### 使用 pip 安装
 
 * 运行以下命令用 `pip` 安装 `data_juicer` 的最新发布版本：

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -79,9 +79,9 @@ process:
   - clean_copyright_mapper:                                 # remove copyright comments.
   - expand_macro_mapper:                                    # expand macro definitions in Latex text.
   - extract_entity_attribute_mapper:                        # Extract attributes for given entities from the text.
+      api_model: 'gpt-4o'                                     # API model name.
       query_entities: ["孙悟空", "猪八戒"]                      # Entity list to be queried.
       query_attributes: ["人物性格"]                            # Attribute list to be queried.
-      api_model: 'gpt-4o'                                     # API model name.
       entity_key: '__dj__entity__'                            # The field name to store the given main entity for attribute extraction.
       entity_attribute_key: '__dj__attribute__'               # The field name to store the given attribute to be extracted.
       attribute_desc_key: '__dj__attribute_description__'     # The field name to store the extracted attribute description.
@@ -153,6 +153,18 @@ process:
       drop_text: false                                        # If drop the text in the output.
       model_params: {}                                        # Parameters for initializing the API model.
       sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
+  - extract_support_text_mapper:                            # extract support sub text for a summary.
+      api_model: 'gpt-4o'                                     # API model name.
+      summary_key: '__dj__event_description__'                # The field name to store the input summary. Support for nested keys such as "__dj__stats__.text_len".
+      support_text_key: '__dj__support_text__'                # The field name to store the output support text for the summary.
+      api_endpoint: null                                      # URL endpoint for the API.
+      response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt: null                                     # System prompt for the task.
+      input_template: null                                    # Template for building the model input.
+      try_num: 3                                              # The number of retry attempts when there is an API call error or output parsing error.
+      drop_text: false                                        # If drop the text in the output.
+      model_params: {}                                        # Parameters for initializing the API model.
+      sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
   - fix_unicode_mapper:                                     # fix unicode errors in text.
   - generate_qa_from_examples_mapper:                       # mapper to generate question and answer pairs from examples.
       hf_model: 'Qwen/Qwen2.5-7B-Instruct'                    # Model name on huggingface to generate question and answer pairs.
@@ -259,12 +271,27 @@ process:
       model_params: {}                                        # Parameters for initializing the API model.
       sampling_params: {}                                     # Extra parameters passed to the API call.
   - punctuation_normalization_mapper:                       # normalize unicode punctuations to English punctuations.
-  - python_python_mapper:                                   # executing Python lambda function defined in a file.
+  - python_file_mapper:                                     # executing Python function defined in a file.
       file_path: ''                                           # The path to the Python file containing the function to be executed.
       function_name: 'process_single'                         # The name of the function defined in the file to be executed.
   - python_lambda_mapper:                                   # executing Python lambda function on data samples.
       lambda_str: ''                                          # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
       batched: False                                          # A boolean indicating whether to process input data in batches.
+  - relation_identity_mapper:                               # identify the relation between two entities in the text.
+      api_model: 'gpt-4o'                                     # API model name.
+      source_entity: '孙悟空'                                  # The source entity of the relation to be identified.
+      target_entity: '猪八戒'                                  # The target entity of the relation to be identified.
+      input_key: null                                         # The input field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is text_key in default.
+      output_key: null                                        # The output field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is input_key in default.
+      api_endpoint: null                                      # URL endpoint for the API.
+      response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt_template: null                            # System prompt template for the task. Need to specify by entity1 and entity2.
+      input_template: null                                    # Template for building the model input.
+      output_pattern_template: null                           # Regular expression template for parsing model output.
+      try_num: 3                                              # The number of retry attempts when there is an API call error or output parsing error.
+      drop_text: false                                        # If drop the text in the output.
+      model_params: {}                                        # Parameters for initializing the API model.
+      sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
   - remove_bibliography_mapper:                             # remove bibliography from Latex text.
   - remove_comments_mapper:                                 # remove comments from Latex text, code, etc.
       doc_type: tex                                           # comment type you want to remove. Only support 'tex' for now.
@@ -693,3 +720,55 @@ process:
       top_ratio:                                              # ratio of selected top samples
       topk:                                                   # number of selected top samples
       reverse: True                                           # determine the sorting rule, if reverse=True, then sort in descending order
+
+# Grouper ops.
+  - naive_grouper:                                          # Group all samples to one batched sample.
+  - key_value_grouper:                                      # Group samples into batched samples according to the values of the given keys.
+      group_by_keys: null                                     # Group samples according to the values of these keys. Nested keys such as "__dj__stats__.text_len" are supported. Defaults to [self.text_key].
+
+# Aggregator ops.
+  - entity_attribute_aggregator:                            # Return conclusion of the given entity's attribute from some docs.
+      api_model: 'gpt-4o'                                     # API model name.
+      entity: '孙悟空'                                         # The given entity.
+      attribute: '人物经历'                                    # The given attribute.
+      input_key: null                                         # The input field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is text_key in default.
+      output_key: null                                        # The output field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is same as the input_key in default.
+      word_limit: 100                                         # Prompt the output length.
+      max_token_num: null                                     # The max token num of the total tokens of the sub documents. Without limitation if it is None.
+      api_endpoint: null                                      # URL endpoint for the API.
+      response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt_template: null                            # System prompt template for the task. Need to be specified by given entity and attribute.
+      example_prompt: null                                    # The example part in the system prompt.
+      input_template: null                                    # The input template.
+      output_pattern_template: null                           # The output template.
+      try_num: 3                                              # The number of retry attempts when there is an API call error or output parsing error.
+      model_params: {}                                        # Parameters for initializing the API model.
+      sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
+  - most_relavant_entities_aggregator:                      # Extract entities closely related to a given entity from some texts, and sort them in descending order of importance.
+      api_model: 'gpt-4o'                                     # API model name.
+      entity: '孙悟空'                                         # The given entity.
+      query_entity_type: '人物'                                # The type of the relevant entities to query.
+      input_key: null                                         # The input field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is text_key in default.
+      output_key: null                                        # The output field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is same as the input_key in default.
+      max_token_num: null                                     # The max token num of the total tokens of the sub documents. Without limitation if it is None.
+      api_endpoint: null                                      # URL endpoint for the API.
+      response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt_template: null                            # System prompt template for the task. Need to be specified by given entity and entity_type.
+      input_template: null                                    # The input template.
+      output_pattern: null                                    # The output pattern.
+      try_num: 3                                              # The number of retry attempts when there is an API call error or output parsing error.
+      model_params: {}                                        # Parameters for initializing the API model.
+      sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
+  - nested_aggregator:                                      # Considering the limitation of input length, nested aggregate contents for each given number of samples.
+      api_model: 'gpt-4o'                                     # API model name.
+      input_key: null                                         # The input field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is text_key in default.
+      output_key: null                                        # The output field key in the samples. Support for nested keys such as "__dj__stats__.text_len". It is same as the input_key in default.
+      max_token_num: null                                     # The max token num of the total tokens of the sub documents. Without limitation if it is None.
+      api_endpoint: null                                      # URL endpoint for the API.
+      response_path: null                                     # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
+      system_prompt: null                                     # The system prompt.
+      sub_doc_template: null                                  # The template for input text in each sample.
+      input_template: null                                    # The input template.
+      try_num: 3                                              # The number of retry attempts when there is an API call error or output parsing error.
+      model_params: {}                                        # Parameters for initializing the API model.
+      sampling_params: {}                                     # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py
@@ -622,8 +622,13 @@ def sort_op_by_types_and_names(op_name_classes):
                         if 'deduplicator' in name]
     selector_ops = [(name, c) for (name, c) in op_name_classes
                     if 'selector' in name]
+    grouper_ops = [(name, c) for (name, c) in op_name_classes
+                   if 'grouper' in name]
+    aggregator_ops = [(name, c) for (name, c) in op_name_classes
+                      if 'aggregator' in name]
     ops_sorted_by_types = sorted(mapper_ops) + sorted(filter_ops) + sorted(
-        deduplicator_ops) + sorted(selector_ops)
+        deduplicator_ops) + sorted(selector_ops) + sorted(grouper_ops) + \
+        sorted(aggregator_ops)
     return ops_sorted_by_types