From 1554138a78e92db882bb07fe68289c4d05f099c2 Mon Sep 17 00:00:00 2001 From: Yilun Huang Date: Thu, 26 Dec 2024 14:39:01 +0800 Subject: [PATCH] Format conversion tools for post tuning datasets (#514) * + add sharegpt <--> dj format conversion tools * - move multimodal into fmt_conversion * + add basic docs for format conversion tools and post tuning dialog format conversion tools * * rename tools * + add messages <--> dj conversion tools * + add messages <--> dj conversion tools * - reorganize the directory * * rename functions * + add conversion tools for ModelScope-Swift ShareGPT format * + add conversion tools for Alpaca format * * fix typos in doc strings * Update post_tuning_dialog/README.md * Update pos_tuning_dialog/README_ZH.md align with en version * clearly point out the DJ format * clearly point out the DJ format in zh * minor typo fix --------- Co-authored-by: Daoyuan Chen <67475544+yxdyc@users.noreply.github.com> --- README.md | 2 +- README_ZH.md | 2 +- tools/fmt_conversion/README.md | 54 +++++ tools/fmt_conversion/README_ZH.md | 54 +++++ .../{ => fmt_conversion}/multimodal/README.md | 4 +- .../multimodal/README_ZH.md | 4 +- .../absolute_path_to_relative_path.py | 0 .../dj_to_internvid.py | 2 +- .../dj_to_llava.py | 0 .../dj_to_mmc4.py | 0 .../dj_to_msrvtt.py | 2 +- .../dj_to_video_chatgpt.py | 2 +- .../dj_to_wavcaps.py | 0 .../dj_to_youku.py | 2 +- .../internvid_to_dj.py | 4 +- .../llava_to_dj.py | 0 .../mmc4_to_dj.py | 0 .../msrvtt_to_dj.py | 4 +- .../video_chatgpt_to_dj.py | 4 +- .../wavcaps_to_dj.py | 0 .../youku_to_dj.py | 4 +- .../{ => fmt_conversion}/multimodal/utils.py | 0 .../post_tuning_dialog/README.md | 96 ++++++++ .../post_tuning_dialog/README_ZH.md | 98 ++++++++ .../dj_to_alpaca.py | 110 +++++++++ .../dj_to_llama_factory_sharegpt.py | 185 +++++++++++++++ .../dj_to_messages.py | 110 +++++++++ .../dj_to_ms_swift_sharegpt.py | 143 ++++++++++++ .../alpaca_to_dj.py | 130 +++++++++++ .../llama_factory_sharegpt_to_dj.py | 216 ++++++++++++++++++ 
.../messages_to_dj.py | 108 +++++++++ .../ms_swift_sharegpt_to_dj.py | 168 ++++++++++++++ 32 files changed, 1490 insertions(+), 18 deletions(-) create mode 100644 tools/fmt_conversion/README.md create mode 100644 tools/fmt_conversion/README_ZH.md rename tools/{ => fmt_conversion}/multimodal/README.md (99%) rename tools/{ => fmt_conversion}/multimodal/README_ZH.md (99%) rename tools/{ => fmt_conversion}/multimodal/absolute_path_to_relative_path.py (100%) rename tools/{ => fmt_conversion}/multimodal/data_juicer_format_to_target_format/dj_to_internvid.py (98%) rename tools/{ => fmt_conversion}/multimodal/data_juicer_format_to_target_format/dj_to_llava.py (100%) rename tools/{ => fmt_conversion}/multimodal/data_juicer_format_to_target_format/dj_to_mmc4.py (100%) rename tools/{ => fmt_conversion}/multimodal/data_juicer_format_to_target_format/dj_to_msrvtt.py (98%) rename tools/{ => fmt_conversion}/multimodal/data_juicer_format_to_target_format/dj_to_video_chatgpt.py (98%) rename tools/{ => fmt_conversion}/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py (100%) rename tools/{ => fmt_conversion}/multimodal/data_juicer_format_to_target_format/dj_to_youku.py (99%) rename tools/{ => fmt_conversion}/multimodal/source_format_to_data_juicer_format/internvid_to_dj.py (97%) rename tools/{ => fmt_conversion}/multimodal/source_format_to_data_juicer_format/llava_to_dj.py (100%) rename tools/{ => fmt_conversion}/multimodal/source_format_to_data_juicer_format/mmc4_to_dj.py (100%) rename tools/{ => fmt_conversion}/multimodal/source_format_to_data_juicer_format/msrvtt_to_dj.py (96%) rename tools/{ => fmt_conversion}/multimodal/source_format_to_data_juicer_format/video_chatgpt_to_dj.py (97%) rename tools/{ => fmt_conversion}/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py (100%) rename tools/{ => fmt_conversion}/multimodal/source_format_to_data_juicer_format/youku_to_dj.py (97%) rename tools/{ => fmt_conversion}/multimodal/utils.py (100%) create mode 100644 
tools/fmt_conversion/post_tuning_dialog/README.md create mode 100644 tools/fmt_conversion/post_tuning_dialog/README_ZH.md create mode 100644 tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py create mode 100644 tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py create mode 100644 tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_messages.py create mode 100644 tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_ms_swift_sharegpt.py create mode 100644 tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/alpaca_to_dj.py create mode 100644 tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/llama_factory_sharegpt_to_dj.py create mode 100644 tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/messages_to_dj.py create mode 100644 tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py diff --git a/README.md b/README.md index 586869b0a..95eba1da2 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ In this new version, we support more features for **multimodal data (including v - [2024-02-05] Our paper has been accepted by SIGMOD'24 industrial track! - [2024-01-10] Discover new horizons in "Data Mixture"—Our second data-centric LLM competition has kicked off! Please visit the competition's [official website](https://tianchi.aliyun.com/competition/entrance/532174) for more information. - [2024-01-05] We release **Data-Juicer v0.1.3** now! -In this new version, we support **more Python versions** (3.8-3.10), and support **multimodal** dataset [converting](tools/multimodal/README.md)/[processing](docs/Operators.md) (Including texts, images, and audios. More modalities will be supported in the future). 
+In this new version, we support **more Python versions** (3.8-3.10), and support **multimodal** dataset [converting](tools/fmt_conversion/multimodal/README.md)/[processing](docs/Operators.md) (Including texts, images, and audios. More modalities will be supported in the future). Besides, our paper is also updated to [v3](https://arxiv.org/abs/2309.02033). - [2023-10-13] Our first data-centric LLM competition begins! Please visit the competition's official websites, FT-Data Ranker ([1B Track](https://tianchi.aliyun.com/competition/entrance/532157), [7B Track](https://tianchi.aliyun.com/competition/entrance/532158)), for more information. diff --git a/README_ZH.md b/README_ZH.md index 42612964a..6ba358b37 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -47,7 +47,7 @@ Data-Juicer正在积极更新和维护中,我们将定期强化和新增更多 - [2024-02-05] 我们的论文被SIGMOD'24 industrial track接收! - [2024-01-10] 开启“数据混合”新视界——第二届Data-Juicer大模型数据挑战赛已经正式启动!立即访问[竞赛官网](https://tianchi.aliyun.com/competition/entrance/532174),了解赛事详情。 - [2024-01-05] **Data-Juicer v0.1.3** 版本发布了。 -在这个新版本中,我们支持了**更多Python版本**(3.8-3.10),同时支持了**多模态**数据集的[转换](tools/multimodal/README_ZH.md)和[处理](docs/Operators_ZH.md)(包括文本、图像和音频。更多模态也将会在之后支持)! +在这个新版本中,我们支持了**更多Python版本**(3.8-3.10),同时支持了**多模态**数据集的[转换](tools/fmt_conversion/multimodal/README_ZH.md)和[处理](docs/Operators_ZH.md)(包括文本、图像和音频。更多模态也将会在之后支持)! 此外,我们的论文也更新到了[第三版](https://arxiv.org/abs/2309.02033) 。 - [2023-10-13] 我们的第一届以数据为中心的 LLM 竞赛开始了! 请访问大赛官网,FT-Data Ranker([1B赛道](https://tianchi.aliyun.com/competition/entrance/532157) 、[7B赛道](https://tianchi.aliyun.com/competition/entrance/532158) ) ,了解更多信息。 diff --git a/tools/fmt_conversion/README.md b/tools/fmt_conversion/README.md new file mode 100644 index 000000000..38629ef35 --- /dev/null +++ b/tools/fmt_conversion/README.md @@ -0,0 +1,54 @@ +# Format Conversion Tools + +Here Data-Juicer provides tens of format conversion tools for diverse datasets, including multimodal datasets, post tuning datasets, and so on. 
+These tools help to convert the dataset in the original format to a unified, intermediate format used in Data-Juicer, which we call the "DJ format". +An overview of DJ format is shown below: + +```python +{ + // >>> core contents: texts, dialogs, ... + "text": "xxx", + "query": "xxx", + "response": "xxx", + ...... + // <<< core contents + + // >>> extra data contents: multimodal data paths, ... + "images": [ + "path/to/the/image/of/antarctica_snowfield", + "path/to/the/image/of/antarctica_map", + "path/to/the/image/of/europe_map" + ], + "audios": [ + "path/to/the/audio/of/sound_of_waves_in_Antarctic_Ocean" + ], + "videos": [ + "path/to/the/video/of/remote_sensing_view_of_antarctica" + ], + // <<< extra data contents + + // >>> meta infos and stats, which could be primitive or produced by Data-Juicer + "meta": { + "src": "customized", + "version": "0.1", + "author": "xxx" + }, + "stats": { + "lang": "en", + "image_widths": [224, 336, 512], + ... + }, + // <<< meta infos and stats +} +``` + +There are roughly three parts in DJ format: +1. Core contents: such as texts in the pretraining dataset of LLMs, dialogs in the post tuning dataset, and so on. They are directly related to the training or fine-tuning procedures in the downstream usage of the dataset. +2. Extra data contents: such as the paths to the multimodal data in the multimodal datasets. They are organized as path lists. +3. Meta infos & Stats: such as version or source information of the dataset that is inherited from the original dataset, or category tags and stats produced by OPs of Data-Juicer. + +The 2nd and 3rd parts are commonly used and organized in nearly the same structure for diverse datasets. +In contrast, the 1st part, which is the core contents, might be quite different for different kinds of datasets. 
+Here are the corresponding documents for different datasets that introduce more details about this part: +- [Multimodal datasets](multimodal/README.md) +- [Post Tuning](post_tuning_dialog/README.md) \ No newline at end of file diff --git a/tools/fmt_conversion/README_ZH.md b/tools/fmt_conversion/README_ZH.md new file mode 100644 index 000000000..5ab13fc9c --- /dev/null +++ b/tools/fmt_conversion/README_ZH.md @@ -0,0 +1,54 @@ +# 格式转换工具 + +在这里,Data-Juicer 为各式各样的数据集提供了十数种格式转换工具,包括多模态数据集,后微调数据集等等。 +这些工具帮助我们将原始格式的数据集转换为 Data-Juicer 使用的一种统一的、中间的格式表示,我们将其称为"DJ 格式"。 +DJ 格式的一个示例如下所示: + +```python +{ + // >>> 核心内容:文本,对话,...... + "text": "xxx", + "query": "xxx", + "response": "xxx", + ...... + // <<< 核心内容 + + // >>> 额外数据内容:多模态数据路径,...... + "images": [ + "path/to/the/image/of/antarctica_snowfield", + "path/to/the/image/of/antarctica_map", + "path/to/the/image/of/europe_map" + ], + "audios": [ + "path/to/the/audio/of/sound_of_waves_in_Antarctic_Ocean" + ], + "videos": [ + "path/to/the/video/of/remote_sensing_view_of_antarctica" + ], + // <<< 额外数据内容 + + // >>> meta 信息和 stats,它们可能是数据集原生的,也可以由 Data-Juicer 产出 + "meta": { + "src": "customized", + "version": "0.1", + "author": "xxx" + }, + "stats": { + "lang": "en", + "image_widths": [224, 336, 512], + ... + }, + // <<< meta 信息和 stats +} +``` + +在 DJ 格式中大概包括三个部分: +1. 核心内容:例如 LLM 的预训练数据集中的文本内容,后微调数据集中的对话内容等。它们与数据集的下游使用的训练或者微调过程直接相关。 +2. 额外数据内容:例如多模态数据集中的多模态数据路径。它们被组织为路径列表。 +3. 
Meta 信息和 Stats:例如从原始数据集中继承而来的数据集版本或来源信息,或者由 Data-Juicer 的算子产出的类别 tags 和 stats 信息。 + +其中,第 2 和第 3 部分对于不同的数据集来说是通用的,而且都会被组织为几乎相同的结构。 +作为对比,第 1 部分,也就是核心内容部分,对于各种数据集来说可能非常不同。 +这里列举了针对不同种类数据集介绍这个部分更多细节的对应的文档: +- [多模态数据集](multimodal/README_ZH.md) +- [后微调数据集](post_tuning_dialog/README_ZH.md) \ No newline at end of file diff --git a/tools/multimodal/README.md b/tools/fmt_conversion/multimodal/README.md similarity index 99% rename from tools/multimodal/README.md rename to tools/fmt_conversion/multimodal/README.md index 60ff084b8..a4a15aac6 100644 --- a/tools/multimodal/README.md +++ b/tools/fmt_conversion/multimodal/README.md @@ -10,7 +10,7 @@ Both input and output of this utility conform to Data-Juicer's data format. If y To learn more about the usage of the absolute to relative path conversion tool, you can execute the following command: ```shell -python tools/multimodal/absolute_path_to_relative_path.py --help +python tools/fmt_conversion/multimodal/absolute_path_to_relative_path.py --help ``` ## Dataset Format Conversion @@ -94,7 +94,7 @@ For all tools, you can run the following command to find out the usage of them: ```shell # e.g. 
llava_to_dj.py -python tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --help +python tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --help ``` Before using these tools, you might need to take a glance at the reference diff --git a/tools/multimodal/README_ZH.md b/tools/fmt_conversion/multimodal/README_ZH.md similarity index 99% rename from tools/multimodal/README_ZH.md rename to tools/fmt_conversion/multimodal/README_ZH.md index 07afd10cb..3d28633a4 100644 --- a/tools/multimodal/README_ZH.md +++ b/tools/fmt_conversion/multimodal/README_ZH.md @@ -10,7 +10,7 @@ 可以运行以下命令来了解绝对路径转化相对路径工具的详细用法: ```shell -python tools/multimodal/absolute_path_to_relative_path.py --help +python tools/fmt_conversion/multimodal/absolute_path_to_relative_path.py --help ``` ## 数据集格式转换 @@ -86,7 +86,7 @@ python tools/multimodal/absolute_path_to_relative_path.py --help ```shell # 例如:llava_to_dj.py -python tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --help +python tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/llava_to_dj.py --help ``` 在使用这些工具之前,您可能需要查看上表中每个格式的参考资料,以更好地了解详细的格式信息,并理解每个工具的参数含义。 diff --git a/tools/multimodal/absolute_path_to_relative_path.py b/tools/fmt_conversion/multimodal/absolute_path_to_relative_path.py similarity index 100% rename from tools/multimodal/absolute_path_to_relative_path.py rename to tools/fmt_conversion/multimodal/absolute_path_to_relative_path.py diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_internvid.py b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_internvid.py similarity index 98% rename from tools/multimodal/data_juicer_format_to_target_format/dj_to_internvid.py rename to tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_internvid.py index 4c46f4676..434c31879 100644 --- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_internvid.py +++ 
b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_internvid.py @@ -35,7 +35,7 @@ from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens -from tools.multimodal.utils import remove_dj_special_tokens +from tools.fmt_conversion.multimodal.utils import remove_dj_special_tokens def main( diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_llava.py similarity index 100% rename from tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py rename to tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_llava.py diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_mmc4.py b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_mmc4.py similarity index 100% rename from tools/multimodal/data_juicer_format_to_target_format/dj_to_mmc4.py rename to tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_mmc4.py diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_msrvtt.py b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_msrvtt.py similarity index 98% rename from tools/multimodal/data_juicer_format_to_target_format/dj_to_msrvtt.py rename to tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_msrvtt.py index 4e3e85e32..5cc8c0817 100644 --- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_msrvtt.py +++ b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_msrvtt.py @@ -44,7 +44,7 @@ from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens -from tools.multimodal.utils import remove_dj_special_tokens +from tools.fmt_conversion.multimodal.utils import remove_dj_special_tokens def main( diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_video_chatgpt.py 
b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_video_chatgpt.py similarity index 98% rename from tools/multimodal/data_juicer_format_to_target_format/dj_to_video_chatgpt.py rename to tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_video_chatgpt.py index aa3771c4c..18f1206db 100644 --- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_video_chatgpt.py +++ b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_video_chatgpt.py @@ -38,7 +38,7 @@ from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens -from tools.multimodal.utils import remove_dj_special_tokens +from tools.fmt_conversion.multimodal.utils import remove_dj_special_tokens def main( diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py similarity index 100% rename from tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py rename to tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_youku.py b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_youku.py similarity index 99% rename from tools/multimodal/data_juicer_format_to_target_format/dj_to_youku.py rename to tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_youku.py index e3cb9671c..6b4831b52 100644 --- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_youku.py +++ b/tools/fmt_conversion/multimodal/data_juicer_format_to_target_format/dj_to_youku.py @@ -59,7 +59,7 @@ from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens -from tools.multimodal.utils import remove_dj_special_tokens +from tools.fmt_conversion.multimodal.utils import remove_dj_special_tokens def main( diff --git 
a/tools/multimodal/source_format_to_data_juicer_format/internvid_to_dj.py b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/internvid_to_dj.py similarity index 97% rename from tools/multimodal/source_format_to_data_juicer_format/internvid_to_dj.py rename to tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/internvid_to_dj.py index 1b2ee2caa..5e52e5b02 100644 --- a/tools/multimodal/source_format_to_data_juicer_format/internvid_to_dj.py +++ b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/internvid_to_dj.py @@ -42,8 +42,8 @@ from data_juicer.utils.file_utils import add_suffix_to_filename from data_juicer.utils.mm_utils import (SpecialTokens, cut_video_by_seconds, timecode_string_to_seconds) -from tools.multimodal.utils import (check_args_load_to_dj_data, - convert_text_to_dj) +from tools.fmt_conversion.multimodal.utils import (check_args_load_to_dj_data, + convert_text_to_dj) def main( diff --git a/tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/llava_to_dj.py similarity index 100% rename from tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py rename to tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/llava_to_dj.py diff --git a/tools/multimodal/source_format_to_data_juicer_format/mmc4_to_dj.py b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/mmc4_to_dj.py similarity index 100% rename from tools/multimodal/source_format_to_data_juicer_format/mmc4_to_dj.py rename to tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/mmc4_to_dj.py diff --git a/tools/multimodal/source_format_to_data_juicer_format/msrvtt_to_dj.py b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/msrvtt_to_dj.py similarity index 96% rename from tools/multimodal/source_format_to_data_juicer_format/msrvtt_to_dj.py rename to 
tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/msrvtt_to_dj.py index b42d8e608..0bc25f140 100644 --- a/tools/multimodal/source_format_to_data_juicer_format/msrvtt_to_dj.py +++ b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/msrvtt_to_dj.py @@ -43,8 +43,8 @@ from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens -from tools.multimodal.utils import (check_args_load_to_dj_data, - convert_text_to_dj) +from tools.fmt_conversion.multimodal.utils import (check_args_load_to_dj_data, + convert_text_to_dj) def main( diff --git a/tools/multimodal/source_format_to_data_juicer_format/video_chatgpt_to_dj.py b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/video_chatgpt_to_dj.py similarity index 97% rename from tools/multimodal/source_format_to_data_juicer_format/video_chatgpt_to_dj.py rename to tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/video_chatgpt_to_dj.py index d05d64fc5..36f0e6473 100644 --- a/tools/multimodal/source_format_to_data_juicer_format/video_chatgpt_to_dj.py +++ b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/video_chatgpt_to_dj.py @@ -37,8 +37,8 @@ from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens -from tools.multimodal.utils import (check_args_load_to_dj_data, - convert_text_to_dj) +from tools.fmt_conversion.multimodal.utils import (check_args_load_to_dj_data, + convert_text_to_dj) @logger.catch(reraise=True) diff --git a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py similarity index 100% rename from tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py rename to tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py diff --git a/tools/multimodal/source_format_to_data_juicer_format/youku_to_dj.py 
b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/youku_to_dj.py similarity index 97% rename from tools/multimodal/source_format_to_data_juicer_format/youku_to_dj.py rename to tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/youku_to_dj.py index 092a03958..15a570c55 100644 --- a/tools/multimodal/source_format_to_data_juicer_format/youku_to_dj.py +++ b/tools/fmt_conversion/multimodal/source_format_to_data_juicer_format/youku_to_dj.py @@ -58,8 +58,8 @@ from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens -from tools.multimodal.utils import (check_args_load_to_dj_data, - convert_text_to_dj) +from tools.fmt_conversion.multimodal.utils import (check_args_load_to_dj_data, + convert_text_to_dj) @logger.catch(reraise=True) diff --git a/tools/multimodal/utils.py b/tools/fmt_conversion/multimodal/utils.py similarity index 100% rename from tools/multimodal/utils.py rename to tools/fmt_conversion/multimodal/utils.py diff --git a/tools/fmt_conversion/post_tuning_dialog/README.md b/tools/fmt_conversion/post_tuning_dialog/README.md new file mode 100644 index 000000000..4d06a496f --- /dev/null +++ b/tools/fmt_conversion/post_tuning_dialog/README.md @@ -0,0 +1,96 @@ +# Post Tuning Tools + +For post tuning formats, we mainly consider 4 formats to support [ModelScope-Swift](https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md) and [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md). 
+ +- Swift's Messages format (very similar to LLaMA-Factory's ShareGPT format, but with different key names): + +```python +{ + "messages": [ + { + "role": "system", + "content": "" + }, + { + "role": "user", + "content": "" + }, + { + "role": "assistant", + "content": "" + }, + { + "role": "user", + "content": "" + }, + { + "role": "assistant", + "content": "" + } + ] +} +``` + +- Swift's ShareGPT format: + +```python +{ + "system": "", + "conversation": [ + { + "human": "", + "assistant": "" + }, + { + "human": "", + "assistant": "" + } + ] +} +``` + +- Alpaca format (defined identically in Swift and LLaMA-Factory): + +```python +{ + "system": "", + "instruction": "", + "input": "", + "output": "" +} +``` + +- Swift's Query-Response format: + +```python +{ + "system": "", + "query": "", + "response": "", + "history": [ + [ + "", + "" + ] + ] +} +``` + +In Data-Juicer, we pre-set fields to align with the last two formats (Alpaca and Query-Response), and the resulting field set serves as our intermediate format for post-tuning dialog datasets. Correspondingly, we provide several tools to convert datasets in other formats to the following DJ format and vice versa. 
+ +- DJ default format for post-tuning OPs: + +```python +{ + "system": "", + "instruction": "", + "query": "", + "response": "", + "history": [ + [ + "", + "" + ] + ] +} +``` diff --git a/tools/fmt_conversion/post_tuning_dialog/README_ZH.md b/tools/fmt_conversion/post_tuning_dialog/README_ZH.md new file mode 100644 index 000000000..ad73caba6 --- /dev/null +++ b/tools/fmt_conversion/post_tuning_dialog/README_ZH.md @@ -0,0 +1,98 @@ +# 后微调工具 + +对于 后微调 数据格式,我们主要考虑 4 种格式来覆盖支持 [ModelScope-Swift](https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md) 和 [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md) : + +- Swift的 Messages 格式(与LLaMA-Factory的 ShareGPT 格式几乎一致,采用了略微不同的key字段命名): + +```python +{ + "messages": [ + { + "role": "system", + "content": "" + }, + { + "role": "user", + "content": "" + }, + { + "role": "assistant", + "content": "" + }, + { + "role": "user", + "content": "" + }, + { + "role": "assistant", + "content": "" + } + ] +} +``` + +- Swift的 ShareGPT 格式: + +```python +{ + "system": "", + "conversation": [ + { + "human": "", + "assistant": "" + }, + { + "human": "", + "assistant": "" + } + ] +} +``` + +- Alpaca 格式 (在Swift和LLaMA-Factory中定义一致): + +```python +{ + "system": "", + "instruction": "", + "input": "", + "output": "" +} +``` + +- Swift的Query-Response 格式: + +```python +{ + "system": "", + "query": "", + "response": "", + "history": [ + [ + "", + "" + ] + ] +} +``` + +在 Data-Juicer 中,我们预设了一些字段来对齐最后两种格式(Alpaca和Query-Response),并将如下格式作为 后微调对话 数据集的统一中间表示。 +相应地,我们提供了若干内置工具将其他格式的数据集转换为 DJ 格式以及反向转换。 + + +- DJ的多轮对话缺省格式(DJ post-tuning算子实现时假设基于该格式进行字段解析和处理): + +```python +{ + "system": "", + "instruction": "", + "query": "", + "response": "", + "history": [ + [ + "", + "" + ] + ] +} +``` diff --git a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py new file 
mode 100644 index 000000000..f79fd0c43 --- /dev/null +++ b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py @@ -0,0 +1,110 @@ +# This tool is used to convert dataset in Data-Juicer format to a +# target dataset in Alpaca-like format. +# +# Data-Juicer format (query-response format): +# [ +# { +# "system": "", +# "instruction": "", +# "query": "", +# "response": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ... +# ] +# +# Corresponding Alpaca format: +# [ +# { +# "system": "", +# "instruction": "", +# "input": "", +# "output": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ...... 
+# ] +# +# Reference: +# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md +# https://github.com/hiyouga/LLaMA-Factory/blob/v0.9.1/data/README.md#alpaca-format + +import json +import os + +import fire +import jsonlines as jl +from loguru import logger +from tqdm import tqdm + + +def dj_to_alpaca( + sample, + input_key: str = 'input', + output_key: str = 'output', +): + modified_keys = {'query', 'response'} + new_sample = { + key: sample[key] + for key in sample if key not in modified_keys and sample[key] + } + + # key mapping + if 'query' in sample: + new_sample[input_key] = sample['query'] + if 'response' in sample: + new_sample[output_key] = sample['response'] + + return new_sample + + +@logger.catch(reraise=True) +def main( + src_ds_path: str, + tgt_ds_path: str, + input_key: str = 'input', + output_key: str = 'output', +): + """ + Convert a Data-Juicer dataset to the Alpaca-like format. + + :param src_ds_path: the path to the source dataset. + :param tgt_ds_path: the path to store the converted target dataset. + :param input_key: the field key to store the query sentence from human. + :param output_key: the field key to store the response sentence from + assistant. 
+ """ + + # check arguments + # check paths + if not os.path.exists(src_ds_path): + raise FileNotFoundError( + f'Input dataset [{src_ds_path}] can not be found.') + if not tgt_ds_path.endswith('.json'): + raise ValueError('Only support "json" target dataset file now.') + if os.path.dirname(tgt_ds_path) \ + and not os.path.exists(os.path.dirname(tgt_ds_path)): + logger.info(f'Create directory [{os.path.dirname(tgt_ds_path)}] ' + f'for the target dataset.') + os.makedirs(os.path.dirname(tgt_ds_path)) + + samples = [] + with jl.open(src_ds_path, 'r') as reader: + for sample in tqdm(reader): + converted_sample = dj_to_alpaca(sample, + input_key=input_key, + output_key=output_key) + samples.append(converted_sample) + + logger.info(f'Store the target dataset into [{tgt_ds_path}].') + json.dump(samples, open(tgt_ds_path, 'w', encoding='utf-8')) + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py new file mode 100644 index 000000000..c72dcbb84 --- /dev/null +++ b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py @@ -0,0 +1,185 @@ +# This tool is used to convert dataset in Data-Juicer format to a +# target dataset in LLaMA-Factory ShareGPT-like format. +# +# Data-Juicer format (query-response format): +# [ +# { +# "images": ["coco/train2017/000000033471.jpg"], +# "query": "Is the bus driving down the street or pulled off to the side?", +# "response": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501 +# "history": [ +# [ +# "\nWhat are the colors of the bus in the image?", +# "The bus in the image is white and red." +# ], +# [ +# "What feature can be seen on the back of the bus?", +# "The back of the bus features an advertisement." 
def dj_to_llama_factory_sharegpt(
    sample,
    conversations_key: str = 'conversations',
    from_key: str = 'from',
    value_key: str = 'value',
    human_role: str = 'user',
    assistant_role: str = 'assistant',
    system_role: str = 'system',
    instruction_role: str = 'instruction',
):
    """
    Convert one Data-Juicer query-response sample to a ShareGPT-like sample.

    The fields "system", "instruction", "history", "query" and "response"
    are folded into a single list of turns stored under `conversations_key`.
    Every other field is copied as is, except fields whose value is falsy
    (empty string/list, None, 0), which are dropped.

    Missing or null "history" is treated as an empty history and a missing
    or null "query" is treated as an empty string, so samples produced from
    single-turn sources (e.g. Alpaca without "input"/"history") do not fail.

    :param sample: the Data-Juicer sample (a dict).
    :param conversations_key: the field key to store the conversations.
    :param from_key: the key of each turn that stores the role.
    :param value_key: the key of each turn that stores the content.
    :param human_role: the role name used for human queries.
    :param assistant_role: the role name used for assistant responses.
    :param system_role: the role name used for the system prompt.
    :param instruction_role: the role name used for the instruction.
    :return: a new dict; the input sample is not modified.
    """
    modified_keys = {'query', 'response', 'history', 'system', 'instruction'}
    new_sample = {
        key: value
        for key, value in sample.items()
        if key not in modified_keys and value
    }

    def _is_set(key):
        # a field only produces a turn when it is present and non-empty
        value = sample.get(key)
        return value is not None and value != ''

    def _turn(role, content):
        return {from_key: role, value_key: content}

    conversations = []

    # system prompt and instruction come first
    if _is_set('system'):
        conversations.append(_turn(system_role, sample['system']))
    if _is_set('instruction'):
        conversations.append(_turn(instruction_role, sample['instruction']))

    # previous rounds of dialog
    for query, response in sample.get('history') or []:
        conversations.append(_turn(human_role, query))
        conversations.append(_turn(assistant_role, response))

    # the current round: the query is always emitted, the response only
    # when it is not empty
    query = sample.get('query')
    conversations.append(_turn(human_role, '' if query is None else query))
    if _is_set('response'):
        conversations.append(_turn(assistant_role, sample['response']))

    new_sample[conversations_key] = conversations
    return new_sample
@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    conversations_key: str = 'conversations',
    from_key: str = 'from',
    value_key: str = 'value',
    human_role: str = 'user',
    assistant_role: str = 'assistant',
    system_role: str = 'system',
    instruction_role: str = 'instruction',
):
    """
    Convert a Data-Juicer dataset to the LLaMA-Factory ShareGPT-like format.

    The source dataset is read as jsonlines and the converted samples are
    written to the target path as one json list.

    :param src_ds_path: the path to the source dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
        Must end with ".json".
    :param conversations_key: the field key to store conversations.
    :param from_key: the field key to store the role of each sentence.
    :param value_key: the field key to store the sentence content.
    :param human_role: the role name used for human prompts.
    :param assistant_role: the role name used for assistant responses.
    :param system_role: the role name used for the system prompt.
    :param instruction_role: the role name used for the instruction content.
    :raises FileNotFoundError: if the source dataset does not exist.
    :raises ValueError: if the target path is not a ".json" file.
    """

    # check arguments
    # check paths
    if not os.path.exists(src_ds_path):
        raise FileNotFoundError(
            f'Input dataset [{src_ds_path}] can not be found.')
    if not tgt_ds_path.endswith('.json'):
        raise ValueError('Only support "json" target dataset file now.')
    tgt_dir = os.path.dirname(tgt_ds_path)
    if tgt_dir and not os.path.exists(tgt_dir):
        logger.info(f'Create directory [{tgt_dir}] '
                    f'for the target dataset.')
        # exist_ok avoids a crash if the directory appears in the meantime
        os.makedirs(tgt_dir, exist_ok=True)

    samples = []
    with jl.open(src_ds_path, 'r') as reader:
        for sample in tqdm(reader):
            converted_sample = dj_to_llama_factory_sharegpt(
                sample,
                conversations_key=conversations_key,
                from_key=from_key,
                value_key=value_key,
                human_role=human_role,
                assistant_role=assistant_role,
                system_role=system_role,
                instruction_role=instruction_role)
            samples.append(converted_sample)

    logger.info(f'Store the target dataset into [{tgt_ds_path}].')
    # use a context manager so the output is flushed and closed
    with open(tgt_ds_path, 'w', encoding='utf-8') as fout:
        json.dump(samples, fout)


if __name__ == '__main__':
    fire.Fire(main)
@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    messages_key: str = 'messages',
    role_key: str = 'role',
    content_key: str = 'content',
    human_role: str = 'user',
    assistant_role: str = 'assistant',
    system_role: str = 'system',
    instruction_role: str = 'instruction',
):
    """
    Convert a Data-Juicer query-response dataset to the ModelScope-Swift
    Messages format.

    This is a thin wrapper: all work is delegated to
    `dj_to_llama_factory_sharegpt.main`, with the Messages key names mapped
    onto its ShareGPT key arguments.

    :param src_ds_path: the path to the source dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
    :param messages_key: the field key to store messages. Passed as
        `conversations_key`.
    :param role_key: the field key to store the role of each message.
        Passed as `from_key`.
    :param content_key: the field key to store the message content.
        Passed as `value_key`.
    :param human_role: the role name used for human prompts.
    :param assistant_role: the role name used for assistant responses.
    :param system_role: the role name used for the system prompt.
    :param instruction_role: the role name used for the instruction content.
    """
    dj_to_llama_factory_sharegpt.main(
        src_ds_path,
        tgt_ds_path,
        conversations_key=messages_key,
        from_key=role_key,
        value_key=content_key,
        human_role=human_role,
        assistant_role=assistant_role,
        system_role=system_role,
        instruction_role=instruction_role,
    )


if __name__ == '__main__':
    fire.Fire(main)
def dj_to_ms_swift_sharegpt(
    sample,
    conversation_key: str = 'conversation',
    human_key: str = 'human',
    assistant_key: str = 'assistant',
    system_key: str = 'system',
    instruction_key: str = 'instruction',
):
    """
    Convert one Data-Juicer query-response sample to a ModelScope-Swift
    ShareGPT-like sample.

    "history" rounds plus the current "query"/"response" become a list of
    {human, assistant} pairs stored under `conversation_key`. "system" and
    "instruction" are copied to `system_key` / `instruction_key` when they
    exist in the sample. All other fields are copied unchanged.

    Missing or null "history" is treated as an empty history; a missing
    "query" or "response" is stored as an empty string.

    :param sample: the Data-Juicer sample (a dict).
    :param conversation_key: the field key to store the conversation.
    :param human_key: the key of each round that stores the human sentence.
    :param assistant_key: the key of each round that stores the assistant
        sentence.
    :param system_key: the field key to store the system prompt.
    :param instruction_key: the field key to store the instruction content.
    :return: a new dict; the input sample is not modified.
    """
    modified_keys = {'query', 'response', 'history', 'system', 'instruction'}
    new_sample = {
        key: value
        for key, value in sample.items() if key not in modified_keys
    }

    # system prompt and instruction are renamed, not merged into the dialog
    if 'system' in sample:
        new_sample[system_key] = sample['system']
    if 'instruction' in sample:
        new_sample[instruction_key] = sample['instruction']

    # previous rounds first, then the current round
    conversation = [{
        human_key: query,
        assistant_key: response,
    } for query, response in sample.get('history') or []]
    conversation.append({
        human_key: sample.get('query', ''),
        assistant_key: sample.get('response', ''),
    })

    new_sample[conversation_key] = conversation
    return new_sample
@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    conversation_key: str = 'conversation',
    human_key: str = 'human',
    assistant_key: str = 'assistant',
    system_key: str = 'system',
    instruction_key: str = 'instruction',
):
    """
    Convert a Data-Juicer query-response dataset to the ModelScope-Swift
    ShareGPT-like format.

    The source dataset is read as jsonlines and the converted samples are
    written to the target path as one json list.

    :param src_ds_path: the path to the source dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
        Must end with ".json".
    :param conversation_key: the field key to store conversations.
    :param human_key: the field key to store the sentence from human.
    :param assistant_key: the field key to store the sentence from assistant.
    :param system_key: the field key to store the system prompt.
    :param instruction_key: the field key to store the instruction content.
    :raises FileNotFoundError: if the source dataset does not exist.
    :raises ValueError: if the target path is not a ".json" file.
    """

    # check arguments
    # check paths
    if not os.path.exists(src_ds_path):
        raise FileNotFoundError(
            f'Input dataset [{src_ds_path}] can not be found.')
    if not tgt_ds_path.endswith('.json'):
        raise ValueError('Only support "json" target dataset file now.')
    tgt_dir = os.path.dirname(tgt_ds_path)
    if tgt_dir and not os.path.exists(tgt_dir):
        logger.info(f'Create directory [{tgt_dir}] '
                    f'for the target dataset.')
        # exist_ok avoids a crash if the directory appears in the meantime
        os.makedirs(tgt_dir, exist_ok=True)

    # load dataset
    samples = []
    with jl.open(src_ds_path, 'r') as reader:
        for sample in tqdm(reader):
            converted_sample = dj_to_ms_swift_sharegpt(
                sample,
                conversation_key=conversation_key,
                human_key=human_key,
                assistant_key=assistant_key,
                system_key=system_key,
                instruction_key=instruction_key)
            samples.append(converted_sample)
    logger.info(f'Store the target dataset into [{tgt_ds_path}].')
    # use a context manager so the output is flushed and closed
    with open(tgt_ds_path, 'w', encoding='utf-8') as fout:
        json.dump(samples, fout)


if __name__ == '__main__':
    fire.Fire(main)
def alpaca_to_dj(
    sample,
    input_key: str = 'input',
    output_key: str = 'output',
    multimodal_keys: Union[str, List[str]] = None,
):
    """
    Convert one Alpaca-like sample to the Data-Juicer query-response format.

    `input_key` is renamed to "query" and `output_key` to "response" when
    they exist in the sample. Values of the multimodal fields are wrapped
    into a list when they are not lists already. All other fields (e.g.
    "system", "instruction", "history") are copied unchanged.

    :param sample: the Alpaca-like sample (a dict).
    :param input_key: the field key that stores the query sentence.
    :param output_key: the field key that stores the response sentence.
    :param multimodal_keys: optional key or list of keys that store
        multimodal data. Keys that are absent from the sample are skipped.
    :return: a new dict; the input sample is not modified.
    """
    # accept a single key as well as a list of keys, as the annotation
    # promises; iterating a bare str would yield its characters
    if not multimodal_keys:
        mm_keys = []
    elif isinstance(multimodal_keys, str):
        mm_keys = [multimodal_keys]
    else:
        mm_keys = list(multimodal_keys)

    modified_keys = {input_key, output_key, *mm_keys}
    new_sample = {
        key: value
        for key, value in sample.items() if key not in modified_keys
    }

    # key mapping for input and output
    if input_key in sample:
        new_sample['query'] = sample[input_key]
    if output_key in sample:
        new_sample['response'] = sample[output_key]

    # multimodal fields are always stored as lists
    for mm_key in mm_keys:
        if mm_key not in sample:
            # text-only sample in a mixed dataset: nothing to convert
            continue
        value = sample[mm_key]
        new_sample[mm_key] = value if isinstance(value, list) else [value]

    return new_sample
@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    input_key: str = 'input',
    output_key: str = 'output',
    multimodal_keys: Union[str, List[str]] = None,
):
    """
    Convert an Alpaca-like dataset to the Data-Juicer query-response format.

    The source dataset is read as one json document and the converted
    samples are written to the target path as jsonlines.

    :param src_ds_path: the path to the source dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
        Must end with ".jsonl".
    :param input_key: the field key to store the query sentence from human.
    :param output_key: the field key to store the response sentence from
        assistant.
    :param multimodal_keys: optional keys to store multimodal data.
    :raises FileNotFoundError: if the source dataset does not exist.
    :raises ValueError: if the target path is not a ".jsonl" file.
    """

    # check arguments
    # check paths
    if not os.path.exists(src_ds_path):
        raise FileNotFoundError(
            f'Input dataset [{src_ds_path}] can not be found.')
    if not tgt_ds_path.endswith('.jsonl'):
        raise ValueError('Only support "jsonl" target dataset file now.')
    tgt_dir = os.path.dirname(tgt_ds_path)
    if tgt_dir and not os.path.exists(tgt_dir):
        logger.info(f'Create directory [{tgt_dir}] '
                    f'for the target dataset.')
        # exist_ok avoids a crash if the directory appears in the meantime
        os.makedirs(tgt_dir, exist_ok=True)

    if isinstance(multimodal_keys, str):
        multimodal_keys = [multimodal_keys]

    # load Alpaca dataset
    logger.info('Loading original dataset.')
    # use a context manager so the input file is closed after loading
    with open(src_ds_path, 'r', encoding='utf-8') as fin:
        src_ds = json.load(fin)
    logger.info(f'Load [{len(src_ds)}] samples.')

    with jl.open(tgt_ds_path, 'w') as writer:
        for sample in tqdm(src_ds):
            converted_sample = alpaca_to_dj(sample,
                                            input_key=input_key,
                                            output_key=output_key,
                                            multimodal_keys=multimodal_keys)
            writer.write(converted_sample)
    logger.info(f'Store the target dataset into [{tgt_ds_path}].')


if __name__ == '__main__':
    fire.Fire(main)
+# +# LLaMA-Factory ShareGPT format: +# - usually in json format +# [ +# { +# "images": ["coco/train2017/000000033471.jpg"], +# "conversations": [ +# { +# "from": "human", +# "value": "\nWhat are the colors of the bus in the image?" +# }, +# { +# "from": "gpt", +# "value": "The bus in the image is white and red." +# }, +# { +# "from": "human", +# "value": "What feature can be seen on the back of the bus?" +# }, +# { +# "from": "gpt", +# "value": "The back of the bus features an advertisement." +# }, +# { +# "from": "human", +# "value": "Is the bus driving down the street or pulled off to the side?" # noqa: E501 +# }, +# { +# "from": "gpt", +# "value": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501 +# } +# ] +# }, +# ... +# ] +# +# Corresponding Data-Juicer format (query-response format): +# [ +# { +# "images": ["coco/train2017/000000033471.jpg"], +# "query": "Is the bus driving down the street or pulled off to the side?", +# "response": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501 +# "history": [ +# [ +# "\nWhat are the colors of the bus in the image?", +# "The bus in the image is white and red." +# ], +# [ +# "What feature can be seen on the back of the bus?", +# "The back of the bus features an advertisement." +# ], +# ] +# }, +# ... 
def llama_factory_sharegpt_to_dj(
    sample,
    conversations_key: str = 'conversations',
    from_key: str = 'from',
    value_key: str = 'value',
    system_role: str = 'system',
    instruction_role: str = 'instruction',
    multimodal_keys: Union[str, List[str]] = None,
):
    """
    Convert one LLaMA-Factory ShareGPT-like sample to the Data-Juicer
    query-response format.

    Turns whose role is `system_role` / `instruction_role` become the
    "system" / "instruction" fields. The remaining turns are read in order
    as alternating query/response sentences: the last round becomes
    "query"/"response" (with an empty response when the number of turns is
    odd) and the earlier rounds become "history". Values of the multimodal
    fields are wrapped into a list when they are not lists already. All
    other fields are copied unchanged.

    :param sample: the ShareGPT-like sample (a dict).
    :param conversations_key: the field key that stores the conversations.
    :param from_key: the key of each turn that stores the role.
    :param value_key: the key of each turn that stores the content.
    :param system_role: the role name of the system prompt.
    :param instruction_role: the role name of the instruction.
    :param multimodal_keys: optional key or list of keys that store
        multimodal data. Keys that are absent from the sample are skipped.
    :return: a new dict; the input sample and its conversation list are not
        modified.
    :raises NotImplementedError: if more than one non-empty system prompt
        or instruction is found.
    """
    # accept a single key as well as a list of keys
    if not multimodal_keys:
        mm_keys = []
    elif isinstance(multimodal_keys, str):
        mm_keys = [multimodal_keys]
    else:
        mm_keys = list(multimodal_keys)

    modified_keys = {conversations_key, *mm_keys}
    new_sample = {
        key: value
        for key, value in sample.items() if key not in modified_keys
    }

    # Split system prompt / instruction from the dialog turns. The dialog is
    # collected into a new list instead of popping from the original one:
    # popping by ascending index shifts the later indices (removing the
    # wrong turns) and would also modify the caller's sample.
    system_prompt = ''
    instruction = ''
    sentences = []
    for conv in sample[conversations_key]:
        role = conv[from_key]
        if role == system_role:
            if system_prompt != '':
                raise NotImplementedError(
                    'DO NOT support more than 1 system prompts in the '
                    'conversation for now.')
            system_prompt = conv[value_key]
        elif role == instruction_role:
            if instruction != '':
                raise NotImplementedError(
                    'DO NOT support more than 1 instructions in the '
                    'conversation for now.')
            instruction = conv[value_key]
        else:
            sentences.append(conv[value_key])

    # reconstruct query, response and history
    if not sentences:
        query = ''
        response = ''
        earlier = []
    elif len(sentences) % 2 == 0:
        # the last 2 sentences are query and response
        query, response = sentences[-2], sentences[-1]
        earlier = sentences[:-2]
    else:
        # the last sentence is the query and the response is empty
        query = sentences[-1]
        response = ''
        earlier = sentences[:-1]
    history = [[earlier[i], earlier[i + 1]]
               for i in range(0, len(earlier), 2)]

    # get the result sample
    new_sample.update({
        'system': system_prompt,
        'instruction': instruction,
        'query': query,
        'response': response,
        'history': history,
    })

    # multimodal fields are always stored as lists
    for mm_key in mm_keys:
        if mm_key not in sample:
            continue
        value = sample[mm_key]
        new_sample[mm_key] = value if isinstance(value, list) else [value]

    return new_sample
@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    conversations_key: str = 'conversations',
    from_key: str = 'from',
    value_key: str = 'value',
    system_role: str = 'system',
    instruction_role: str = 'instruction',
    multimodal_keys: Union[str, List[str]] = None,
):
    """
    Convert a LLaMA-Factory ShareGPT-like dataset to the Data-Juicer
    query-response format.

    The source dataset is read as one json document and the converted
    samples are written to the target path as jsonlines.

    :param src_ds_path: the path to the source dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
        Must end with ".jsonl".
    :param conversations_key: the field key to store conversations.
    :param from_key: the field key to store the role of each sentence.
    :param value_key: the field key to store the sentence content.
    :param system_role: the role name of the system prompt.
    :param instruction_role: the role name of the instruction content.
    :param multimodal_keys: optional keys to store multimodal data.
    :raises FileNotFoundError: if the source dataset does not exist.
    :raises ValueError: if the target path is not a ".jsonl" file.
    """

    # check arguments
    # check paths
    if not os.path.exists(src_ds_path):
        raise FileNotFoundError(
            f'Input dataset [{src_ds_path}] can not be found.')
    if not tgt_ds_path.endswith('.jsonl'):
        raise ValueError('Only support "jsonl" target dataset file now.')
    tgt_dir = os.path.dirname(tgt_ds_path)
    if tgt_dir and not os.path.exists(tgt_dir):
        logger.info(f'Create directory [{tgt_dir}] '
                    f'for the target dataset.')
        # exist_ok avoids a crash if the directory appears in the meantime
        os.makedirs(tgt_dir, exist_ok=True)

    if isinstance(multimodal_keys, str):
        multimodal_keys = [multimodal_keys]

    # load dataset
    logger.info('Loading original dataset.')
    # use a context manager so the input file is closed after loading
    with open(src_ds_path, 'r', encoding='utf-8') as fin:
        src_ds = json.load(fin)
    logger.info(f'Load [{len(src_ds)}] samples.')

    with jl.open(tgt_ds_path, 'w') as writer:
        for sample in tqdm(src_ds):
            converted_sample = llama_factory_sharegpt_to_dj(
                sample,
                conversations_key=conversations_key,
                from_key=from_key,
                value_key=value_key,
                system_role=system_role,
                instruction_role=instruction_role,
                multimodal_keys=multimodal_keys)
            writer.write(converted_sample)
    logger.info(f'Store the target dataset into [{tgt_ds_path}].')


if __name__ == '__main__':
    fire.Fire(main)
@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    messages_key: str = 'messages',
    role_key: str = 'role',
    content_key: str = 'content',
    system_role: str = 'system',
    instruction_role: str = 'instruction',
    multimodal_keys: Union[str, List[str]] = None,
):
    """
    Convert a Messages-like dataset to the Data-Juicer query-response format.

    This is a thin wrapper: all work is delegated to
    `llama_factory_sharegpt_to_dj.main`, with the Messages key names mapped
    onto its ShareGPT key arguments.

    :param src_ds_path: the path to the source dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
    :param messages_key: the field key to store messages. Passed as
        `conversations_key`.
    :param role_key: the field key to store the role of each message.
        Passed as `from_key`.
    :param content_key: the field key to store the message content.
        Passed as `value_key`.
    :param system_role: the role name of the system prompt.
    :param instruction_role: the role name of the instruction content.
    :param multimodal_keys: optional keys to store multimodal data.
    """
    llama_factory_sharegpt_to_dj.main(
        src_ds_path,
        tgt_ds_path,
        conversations_key=messages_key,
        from_key=role_key,
        value_key=content_key,
        system_role=system_role,
        instruction_role=instruction_role,
        multimodal_keys=multimodal_keys,
    )


if __name__ == '__main__':
    fire.Fire(main)
def ms_swift_sharegpt_to_dj(
    sample,
    conversation_key: str = 'conversation',
    human_key: str = 'human',
    assistant_key: str = 'assistant',
    system_key: str = 'system',
    instruction_key: str = 'instruction',
    multimodal_keys: Union[str, List[str]] = None,
):
    """
    Convert one ModelScope-Swift ShareGPT-like sample to the Data-Juicer
    query-response format.

    The last {human, assistant} round becomes "query"/"response" and the
    earlier rounds become "history". `system_key` / `instruction_key` are
    renamed to "system" / "instruction" when they exist in the sample.
    Values of the multimodal fields are wrapped into a list when they are
    not lists already. All other fields are copied unchanged.

    :param sample: the ShareGPT-like sample (a dict).
    :param conversation_key: the field key that stores the conversation.
    :param human_key: the key of each round that stores the human sentence.
    :param assistant_key: the key of each round that stores the assistant
        sentence. It may be absent from the last round, in which case the
        response is an empty string.
    :param system_key: the field key that stores the system prompt.
    :param instruction_key: the field key that stores the instruction.
    :param multimodal_keys: optional key or list of keys that store
        multimodal data. Keys that are absent from the sample are skipped.
    :return: a new dict; the input sample is not modified.
    """
    # accept a single key as well as a list of keys
    if not multimodal_keys:
        mm_keys = []
    elif isinstance(multimodal_keys, str):
        mm_keys = [multimodal_keys]
    else:
        mm_keys = list(multimodal_keys)

    modified_keys = {conversation_key, system_key, instruction_key, *mm_keys}
    new_sample = {
        key: value
        for key, value in sample.items() if key not in modified_keys
    }

    # find system prompt and instruction
    if system_key in sample:
        new_sample['system'] = sample[system_key]
    if instruction_key in sample:
        new_sample['instruction'] = sample[instruction_key]

    # conversation to query, response, history
    conversation = sample[conversation_key]
    if not conversation:
        query = ''
        response = ''
        history = []
    else:
        *earlier, last = conversation
        # the last round is the current query/response; an unanswered last
        # round (no assistant sentence yet) gives an empty response
        query = last[human_key]
        response = last.get(assistant_key, '')
        history = [[conv[human_key], conv[assistant_key]]
                   for conv in earlier]

    # get the result sample
    new_sample.update({
        'query': query,
        'response': response,
        'history': history,
    })

    # multimodal fields are always stored as lists
    for mm_key in mm_keys:
        if mm_key not in sample:
            continue
        value = sample[mm_key]
        new_sample[mm_key] = value if isinstance(value, list) else [value]

    return new_sample
@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    conversation_key: str = 'conversation',
    human_key: str = 'human',
    assistant_key: str = 'assistant',
    system_key: str = 'system',
    instruction_key: str = 'instruction',
    multimodal_keys: Union[str, List[str]] = None,
):
    """
    Convert a ModelScope-Swift ShareGPT-like dataset to the Data-Juicer
    query-response format.

    The source dataset is read as one json document and the converted
    samples are written to the target path as jsonlines.

    :param src_ds_path: the path to the source dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
        Must end with ".jsonl".
    :param conversation_key: the field key to store conversations.
    :param human_key: the field key to store the sentence from human.
    :param assistant_key: the field key to store the sentence from assistant.
    :param system_key: the field key to store the system prompt.
    :param instruction_key: the field key to store the instruction content.
    :param multimodal_keys: optional keys to store multimodal data.
    :raises FileNotFoundError: if the source dataset does not exist.
    :raises ValueError: if the target path is not a ".jsonl" file.
    """

    # check arguments
    # check paths
    if not os.path.exists(src_ds_path):
        raise FileNotFoundError(
            f'Input dataset [{src_ds_path}] can not be found.')
    if not tgt_ds_path.endswith('.jsonl'):
        raise ValueError('Only support "jsonl" target dataset file now.')
    tgt_dir = os.path.dirname(tgt_ds_path)
    if tgt_dir and not os.path.exists(tgt_dir):
        logger.info(f'Create directory [{tgt_dir}] '
                    f'for the target dataset.')
        # exist_ok avoids a crash if the directory appears in the meantime
        os.makedirs(tgt_dir, exist_ok=True)

    if isinstance(multimodal_keys, str):
        multimodal_keys = [multimodal_keys]

    # load dataset
    logger.info('Loading original dataset.')
    # use a context manager so the input file is closed after loading
    with open(src_ds_path, 'r', encoding='utf-8') as fin:
        src_ds = json.load(fin)
    logger.info(f'Load [{len(src_ds)}] samples.')

    with jl.open(tgt_ds_path, 'w') as writer:
        for sample in tqdm(src_ds):
            converted_sample = ms_swift_sharegpt_to_dj(
                sample,
                conversation_key=conversation_key,
                human_key=human_key,
                assistant_key=assistant_key,
                system_key=system_key,
                instruction_key=instruction_key,
                multimodal_keys=multimodal_keys)
            writer.write(converted_sample)
    logger.info(f'Store the target dataset into [{tgt_ds_path}].')


if __name__ == '__main__':
    fire.Fire(main)