
Commit

Merge branch 'main' into feat/ft_format_conv_tools
HYLcool committed Dec 23, 2024
2 parents 38b5619 + a26dcc7 commit d6148dd
Showing 90 changed files with 3,215 additions and 404 deletions.
9 changes: 8 additions & 1 deletion README.md
@@ -333,9 +333,16 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml
# use command line tool
dj-analyze --config configs/demo/analyzer.yaml
# you can also use auto mode to avoid writing a recipe. It will analyze a small
# part (e.g. 1000 samples, specified by argument `auto_num`) of your dataset
# with all Filters that produce stats.
dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000]
```
-- **Note:** Analyzer only compute stats of Filter ops. So extra Mapper or Deduplicator ops will be ignored in the analysis process.
+- **Note:** The Analyzer only computes stats for Filters that produce stats, as well as for other OPs that produce tags/categories in the meta field; all other OPs are ignored during analysis. We use the following registries to decorate OPs (a sketch follows this list):
+  - `NON_STATS_FILTERS`: decorates Filters that **DO NOT** produce any stats.
+  - `TAGGING_OPS`: decorates OPs that **DO** produce tags/categories in the meta field.
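Below is a minimal sketch of how such registry-based decoration could work. The `Registry` class, its method names, and the example filter are illustrative assumptions, not data-juicer's actual implementation:

```python
# Illustrative sketch -- names and structure are assumptions, not
# data-juicer's actual implementation.
class Registry:
    def __init__(self, name: str):
        self.name = name
        self._modules = {}

    def register_module(self, module_name: str):
        # Used as a class decorator: records the OP class under its name.
        def _register(cls):
            self._modules[module_name] = cls
            return cls
        return _register

    def __contains__(self, module_name: str) -> bool:
        return module_name in self._modules

NON_STATS_FILTERS = Registry('non_stats_filters')
TAGGING_OPS = Registry('tagging_ops')

@NON_STATS_FILTERS.register_module('suffix_filter')
class SuffixFilter:
    """A hypothetical Filter that selects samples without producing stats."""

# At analysis time, the Analyzer can then skip any Filter registered in
# NON_STATS_FILTERS and include any OP registered in TAGGING_OPS.
print('suffix_filter' in NON_STATS_FILTERS)  # True
```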
### Data Visualization
8 changes: 7 additions & 1 deletion README_ZH.md
@@ -310,9 +310,15 @@ python tools/analyze_data.py --config configs/demo/analyzer.yaml
# use command line tool
dj-analyze --config configs/demo/analyzer.yaml
# you can also use auto mode to avoid writing a new data recipe. It will analyze a small
# part of your dataset (e.g. 1000 samples, specified by the `auto_num` argument) with all Filters that produce stats
dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000]
```

-* **Note**: The Analyzer only computes the stats of Filter ops; other ops (e.g. Mapper and Deduplicator) are ignored during the analysis process.
+* **Note**: The Analyzer only works with Filter ops that produce statistics in the stats field and with other ops that produce tags/category labels in the meta field; all other ops are ignored during analysis. We use the following two registries to decorate the relevant ops:
+  * `NON_STATS_FILTERS`: decorates Filter ops that **CANNOT** produce any statistics.
+  * `TAGGING_OPS`: decorates ops that produce tags/category labels in the meta field.

### Data Visualization

85 changes: 82 additions & 3 deletions configs/config_all.yaml
@@ -79,9 +79,9 @@ process:
- clean_copyright_mapper: # remove copyright comments.
- expand_macro_mapper: # expand macro definitions in Latex text.
- extract_entity_attribute_mapper: # Extract attributes for given entities from the text.
-api_model: 'gpt-4o' # API model name.
query_entities: ["孙悟空", "猪八戒"] # Entity list to be queried.
query_attributes: ["人物性格"] # Attribute list to be queried.
+api_model: 'gpt-4o' # API model name.
entity_key: '__dj__entity__' # The field name to store the given main entity for attribute extraction.
entity_attribute_key: '__dj__attribute__' # The field name to store the given attribute to be extracted.
attribute_desc_key: '__dj__attribute_description__' # The field name to store the extracted attribute description.
@@ -153,6 +153,18 @@ process:
drop_text: false # Whether to drop the text in the output.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g. {'temperature': 0.9, 'top_p': 0.95}
- extract_support_text_mapper: # extract the supporting sub-text for a summary.
api_model: 'gpt-4o' # API model name.
summary_key: '__dj__event_description__' # The field name to store the input summary. Supports nested keys such as "__dj__stats__.text_len".
support_text_key: '__dj__support_text__' # The field name to store the output support text for the summary.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt: null # System prompt for the task.
input_template: null # Template for building the model input.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
drop_text: false # Whether to drop the text in the output.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g. {'temperature': 0.9, 'top_p': 0.95}
- fix_unicode_mapper: # fix unicode errors in text.
- generate_qa_from_examples_mapper: # mapper to generate question and answer pairs from examples.
hf_model: 'Qwen/Qwen2.5-7B-Instruct' # Model name on huggingface to generate question and answer pairs.
@@ -259,12 +271,27 @@ process:
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call.
- punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations.
-- python_python_mapper: # executing Python lambda function defined in a file.
+- python_file_mapper: # executing a Python function defined in a file.
file_path: '' # The path to the Python file containing the function to be executed.
function_name: 'process_single' # The name of the function defined in the file to be executed.
- python_lambda_mapper: # executing a Python lambda function on data samples.
lambda_str: '' # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
batched: False # A boolean indicating whether to process input data in batches.
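# An illustrative usage sketch (comments only; the function bodies below are
# assumptions about typical usage, not prescribed examples). For
# python_file_mapper, the file at file_path might define:
#     def process_single(sample):
#         sample['text'] = sample['text'].strip()
#         return sample
# while python_lambda_mapper takes the function inline, e.g.:
#     lambda_str: 'lambda sample: {**sample, "text": sample["text"].lower()}'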
- relation_identity_mapper: # identify the relation between two entities in the text.
api_model: 'gpt-4o' # API model name.
source_entity: '孙悟空' # The source entity of the relation to be identified.
target_entity: '猪八戒' # The target entity of the relation to be identified.
input_key: null # The input field key in the samples. Supports nested keys such as "__dj__stats__.text_len". Defaults to text_key.
output_key: null # The output field key in the samples. Supports nested keys such as "__dj__stats__.text_len". Defaults to input_key.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt_template: null # System prompt template for the task; must be parameterized by entity1 and entity2.
input_template: null # Template for building the model input.
output_pattern_template: null # Regular expression template for parsing model output.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
drop_text: false # Whether to drop the text in the output.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g. {'temperature': 0.9, 'top_p': 0.95}
- remove_bibliography_mapper: # remove bibliography from Latex text.
- remove_comments_mapper: # remove comments from Latex text, code, etc.
doc_type: tex # comment type you want to remove. Only support 'tex' for now.
@@ -567,7 +594,7 @@ process:
vertical_flip: false # flip frame image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple videos in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all videos meet the filter condition
-mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
+mem_required: '1500MB' # This Op uses deep neural network models that consume a significant amount of memory, so the system's available memory may constrain the maximum number of processes that can be launched
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
@@ -693,3 +720,55 @@ process:
top_ratio: # ratio of selected top samples
topk: # number of selected top samples
reverse: True # determines the sorting rule; if reverse=True, sort in descending order

# Grouper ops.
- naive_grouper: # Group all samples into one batched sample.
- key_value_grouper: # Group samples into batched samples according to the values of the given keys.
group_by_keys: null # The keys whose values determine the grouping. Supports nested keys such as "__dj__stats__.text_len". Defaults to [self.text_key].

# Aggregator ops.
- entity_attribute_aggregator: # Return a conclusion about the given entity's attribute, drawn from several docs.
api_model: 'gpt-4o' # API model name.
entity: '孙悟空' # The given entity.
attribute: '人物经历' # The given attribute.
input_key: null # The input field key in the samples. Supports nested keys such as "__dj__stats__.text_len". Defaults to text_key.
output_key: null # The output field key in the samples. Supports nested keys such as "__dj__stats__.text_len". Defaults to input_key.
word_limit: 100 # Suggested output length (in words), used in the prompt.
max_token_num: null # The maximum total token count of the sub-documents; no limit if None.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt_template: null # System prompt template for the task; must be parameterized by the given entity and attribute.
example_prompt: null # The example part in the system prompt.
input_template: null # The input template.
output_pattern_template: null # Regular expression template for parsing the model output.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g. {'temperature': 0.9, 'top_p': 0.95}
- most_relavant_entities_aggregator: # Extract entities closely related to a given entity from some texts, and sort them in descending order of importance.
api_model: 'gpt-4o' # API model name.
entity: '孙悟空' # The given entity.
query_entity_type: '人物' # The type of relevant entities to query.
input_key: null # The input field key in the samples. Supports nested keys such as "__dj__stats__.text_len". Defaults to text_key.
output_key: null # The output field key in the samples. Supports nested keys such as "__dj__stats__.text_len". Defaults to input_key.
max_token_num: null # The maximum total token count of the sub-documents; no limit if None.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt_template: null # System prompt template for the task; must be parameterized by the given entity and entity_type.
input_template: null # The input template.
output_pattern: null # Regular expression pattern for parsing the model output.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g. {'temperature': 0.9, 'top_p': 0.95}
- nested_aggregator: # To respect input-length limits, aggregate contents recursively, a given number of samples at a time.
api_model: 'gpt-4o' # API model name.
input_key: null # The input field key in the samples. Supports nested keys such as "__dj__stats__.text_len". Defaults to text_key.
output_key: null # The output field key in the samples. Supports nested keys such as "__dj__stats__.text_len". Defaults to input_key.
max_token_num: null # The maximum total token count of the sub-documents; no limit if None.
api_endpoint: null # URL endpoint for the API.
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
system_prompt: null # The system prompt.
sub_doc_template: null # The template for input text in each sample.
input_template: null # The input template.
try_num: 3 # The number of retry attempts when there is an API call error or output parsing error.
model_params: {} # Parameters for initializing the API model.
sampling_params: {} # Extra parameters passed to the API call. e.g. {'temperature': 0.9, 'top_p': 0.95}
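# An illustrative end-to-end sketch (comments only, not part of the original
# config): an Aggregator typically consumes the batched samples produced by a
# Grouper, so a minimal recipe might chain them like this:
#     process:
#       - naive_grouper:
#       - entity_attribute_aggregator:
#           api_model: 'gpt-4o'
#           entity: '孙悟空'
#           attribute: '人物经历'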
2 changes: 1 addition & 1 deletion data_juicer/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.0.1'
+__version__ = '1.0.2'

import os
import subprocess
