From 128f1ab215af568a3a9e12352b44d39cf9fbc9e8 Mon Sep 17 00:00:00 2001
From: "lielin.hyl"
Date: Mon, 23 Dec 2024 20:46:07 +0800
Subject: [PATCH] + add conversion tools for ModelScope-Swift ShareGPT format

---
 .../dj_to_llama_factory_sharegpt.py           |   2 +-
 .../dj_to_messages.py                         |   2 +-
 .../dj_to_ms_swift_sharegpt.py                | 143 +++++++++++++++
 .../llama_factory_sharegpt_to_dj.py           |   2 +-
 .../messages_to_dj.py                         |   2 +-
 .../ms_swift_sharegpt_to_dj.py                | 168 ++++++++++++++++++
 6 files changed, 315 insertions(+), 4 deletions(-)
 create mode 100644 tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_ms_swift_sharegpt.py
 create mode 100644 tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py

diff --git a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py
index d53b916ae..8bcc8207f 100644
--- a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py
+++ b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py
@@ -139,7 +139,7 @@ def main(
     """
     Convert a ShareGPT-like dataset to the Data-Juicer query-response format.

-    :param src_ds_path: the path to the source ShareGPT-like dataset.
+    :param src_ds_path: the path to the source dataset.
     :param tgt_ds_path: the path to store the converted target dataset.
     :param conversations_key: the field key to store conversions.
     :param from_key: the field key to store the sentence from.
diff --git a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_messages.py b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_messages.py
index cf9e2325f..417ff82d8 100644
--- a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_messages.py
+++ b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_messages.py
@@ -82,7 +82,7 @@ def main(
     """
     Convert a ShareGPT-like dataset to the Data-Juicer query-response format.

-    :param src_ds_path: the path to the source ShareGPT-like dataset.
+    :param src_ds_path: the path to the source dataset.
     :param tgt_ds_path: the path to store the converted target dataset.
     :param messages_key: the field key to store messages.
     :param role_key: the field key to store the sentence from.
diff --git a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_ms_swift_sharegpt.py b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_ms_swift_sharegpt.py
new file mode 100644
index 000000000..d0d6b6b62
--- /dev/null
+++ b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_ms_swift_sharegpt.py
@@ -0,0 +1,143 @@
+# This tool is used to convert a dataset in Data-Juicer format to a
+# target dataset in ModelScope-Swift ShareGPT format.
+#
+# Data-Juicer format (query-response format):
+# [
+#   {
+#     "system": "",
+#     "query": "",
+#     "response": "",
+#     "history": [
+#       [
+#         "",
+#         ""
+#       ],
+#     ]
+#   },
+#   ...
+# ]
+#
+# Corresponding ModelScope-Swift ShareGPT format:
+# [
+#   {
+#     "system": "",
+#     "conversation": [
+#       {
+#         "human": "",
+#         "assistant": ""
+#       },
+#       {
+#         "human": "",
+#         "assistant": ""
+#       }
+#     ]
+#   },
+#   ......
+# ]
+#
+# Reference:
+# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md
+
+import json
+import os
+
+import fire
+import jsonlines as jl
+from loguru import logger
+from tqdm import tqdm
+
+
+def dj_to_ms_swift_sharegpt(
+    sample,
+    conversation_key: str = 'conversation',
+    human_key: str = 'human',
+    assistant_key: str = 'assistant',
+    system_key: str = 'system',
+    instruction_key: str = 'instruction',
+):
+    modified_keys = {'query', 'response', 'history', 'system', 'instruction'}
+    new_sample = {
+        key: sample[key]
+        for key in sample if key not in modified_keys
+    }
+
+    # find system prompt and instruction
+    if 'system' in sample:
+        new_sample[system_key] = sample['system']
+    if 'instruction' in sample:
+        new_sample[instruction_key] = sample['instruction']
+
+    # construct conversation
+    conversation = []
+    # add the dialog history first
+    for query, response in sample.get('history', []):
+        conversation.append({
+            human_key: query,
+            assistant_key: response,
+        })
+    # the current query/response pair becomes the last round; a missing
+    # response is kept as an empty string
+    conversation.append({
+        human_key: sample.get('query', ''),
+        assistant_key: sample.get('response', ''),
+    })
+
+    new_sample[conversation_key] = conversation
+
+    return new_sample
+
+
+@logger.catch(reraise=True)
+def main(
+    src_ds_path: str,
+    tgt_ds_path: str,
+    conversation_key: str = 'conversation',
+    human_key: str = 'human',
+    assistant_key: str = 'assistant',
+    system_key: str = 'system',
+    instruction_key: str = 'instruction',
+):
+    """
+    Convert a Data-Juicer query-response dataset to the ModelScope-Swift
+    ShareGPT-like format.
+
+    :param src_ds_path: the path to the source dataset.
+    :param tgt_ds_path: the path to store the converted target dataset.
+    :param conversation_key: the field key to store conversations.
+    :param human_key: the field key to store the sentence from human.
+    :param assistant_key: the field key to store the sentence from assistant.
+    :param system_key: the field key to store the system prompt.
+    :param instruction_key: the field key to store the instruction content.
+ """ + + # check arguments + # check paths + if not os.path.exists(src_ds_path): + raise FileNotFoundError( + f'Input dataset [{src_ds_path}] can not be found.') + if not tgt_ds_path.endswith('.json'): + raise ValueError('Only support "json" target dataset file now.') + if os.path.dirname(tgt_ds_path) \ + and not os.path.exists(os.path.dirname(tgt_ds_path)): + logger.info(f'Create directory [{os.path.dirname(tgt_ds_path)}] ' + f'for the target dataset.') + os.makedirs(os.path.dirname(tgt_ds_path)) + + # load dataset + samples = [] + with jl.open(src_ds_path, 'r') as reader: + for sample in tqdm(reader): + converted_sample = dj_to_ms_swift_sharegpt( + sample, + conversation_key=conversation_key, + human_key=human_key, + assistant_key=assistant_key, + system_key=system_key, + instruction_key=instruction_key) + samples.append(converted_sample) + logger.info(f'Store the target dataset into [{tgt_ds_path}].') + json.dump(samples, open(tgt_ds_path, 'w', encoding='utf-8')) + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/llama_factory_sharegpt_to_dj.py b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/llama_factory_sharegpt_to_dj.py index f65821126..5715759c0 100644 --- a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/llama_factory_sharegpt_to_dj.py +++ b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/llama_factory_sharegpt_to_dj.py @@ -166,7 +166,7 @@ def main( """ Convert a ShareGPT-like dataset to the Data-Juicer query-response format. - :param src_ds_path: the path to the source ShareGPT-like dataset. + :param src_ds_path: the path to the source dataset. :param tgt_ds_path: the path to store the converted target dataset. :param conversations_key: the field key to store conversions. :param from_key: the field key to store the sentence from. diff --git a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/messages_to_dj.py b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/messages_to_dj.py index ea4bf40ec..1f5e74071 100644 --- a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/messages_to_dj.py +++ b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/messages_to_dj.py @@ -83,7 +83,7 @@ def main( """ Convert a Messages-like dataset to the Data-Juicer query-response format. - :param src_ds_path: the path to the source ShareGPT-like dataset. + :param src_ds_path: the path to the source dataset. :param tgt_ds_path: the path to store the converted target dataset. :param messages_key: the field key to store messages. :param role_key: the field key to store the sentence from. diff --git a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py new file mode 100644 index 000000000..c3364150b --- /dev/null +++ b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py @@ -0,0 +1,168 @@ +# This tool is used to convert dataset in ModelScope-Swift ShareGPT format to a +# target dataset in Data-Juicer query-response format. +# +# ModelScope-Swift ShareGPT format: +# [ +# { +# "system": "", +# "conversation": [ +# { +# "human": "", +# "assistant": "" +# }, +# { +# "human": "", +# "assistant": "" +# } +# ] +# }, +# ...... 
+# ]
+#
+# Corresponding Data-Juicer format (query-response format):
+# [
+#   {
+#     "system": "",
+#     "query": "",
+#     "response": "",
+#     "history": [
+#       [
+#         "",
+#         ""
+#       ],
+#     ]
+#   },
+#   ...
+# ]
+#
+# Reference:
+# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md
+
+import json
+import os
+from typing import List, Union
+
+import fire
+import jsonlines as jl
+from loguru import logger
+from tqdm import tqdm
+
+
+def ms_swift_sharegpt_to_dj(
+    sample,
+    conversation_key: str = 'conversation',
+    human_key: str = 'human',
+    assistant_key: str = 'assistant',
+    system_key: str = 'system',
+    instruction_key: str = 'instruction',
+    multimodal_keys: Union[str, List[str]] = None,
+):
+    modified_keys = {conversation_key, system_key, instruction_key}
+    if multimodal_keys:
+        modified_keys = modified_keys.union(set(multimodal_keys))
+    new_sample = {
+        key: sample[key]
+        for key in sample if key not in modified_keys
+    }
+
+    # find system prompt and instruction
+    if system_key in sample:
+        new_sample['system'] = sample[system_key]
+    if instruction_key in sample:
+        new_sample['instruction'] = sample[instruction_key]
+
+    # conversations to query, response, history
+    conversation = sample[conversation_key]
+    # reconstruct conversations
+    conv_num = len(conversation)
+    if conv_num == 0:
+        query = ''
+        response = ''
+        history = []
+    else:
+        # the last round gives the query/response; the rest goes to history
+        query = conversation[-1][human_key]
+        response = conversation[-1][assistant_key]
+        history = [[conv[human_key], conv[assistant_key]]
+                   for conv in conversation[:-1]]
+
+    # get the result sample
+    new_sample.update({
+        'query': query,
+        'response': response,
+        'history': history,
+    })
+
+    # update multimodal data (wrap single items into lists)
+    if multimodal_keys:
+        for mm_key in multimodal_keys:
+            mm_value = sample.get(mm_key, [])
+            if not isinstance(mm_value, list):
+                mm_value = [mm_value]
+            new_sample[mm_key] = mm_value
+
+    return new_sample
+
+
+@logger.catch(reraise=True)
+def main(
+    src_ds_path: str,
+    tgt_ds_path: str,
+    conversation_key: str = 'conversation',
+    human_key: str = 'human',
+    assistant_key: str = 'assistant',
+    system_key: str = 'system',
+    instruction_key: str = 'instruction',
+    multimodal_keys: Union[str, List[str]] = None,
+):
+    """
+    Convert a ModelScope-Swift ShareGPT-like dataset to the Data-Juicer
+    query-response format.
+
+    :param src_ds_path: the path to the source ShareGPT-like dataset.
+    :param tgt_ds_path: the path to store the converted target dataset.
+    :param conversation_key: the field key to store conversations.
+    :param human_key: the field key to store the sentence from human.
+    :param assistant_key: the field key to store the sentence from assistant.
+    :param system_key: the field key to store the system prompt.
+    :param instruction_key: the field key to store the instruction content.
+    :param multimodal_keys: optional keys to store multimodal data.
+ """ + + # check arguments + # check paths + if not os.path.exists(src_ds_path): + raise FileNotFoundError( + f'Input dataset [{src_ds_path}] can not be found.') + if not tgt_ds_path.endswith('.jsonl'): + raise ValueError('Only support "jsonl" target dataset file now.') + if os.path.dirname(tgt_ds_path) \ + and not os.path.exists(os.path.dirname(tgt_ds_path)): + logger.info(f'Create directory [{os.path.dirname(tgt_ds_path)}] ' + f'for the target dataset.') + os.makedirs(os.path.dirname(tgt_ds_path)) + + if isinstance(multimodal_keys, str): + multimodal_keys = [multimodal_keys] + + # load ShareGPT dataset + logger.info('Loading original dataset.') + src_ds = json.load(open(src_ds_path, 'r', encoding='utf-8')) + logger.info(f'Load [{len(src_ds)}] samples.') + + with jl.open(tgt_ds_path, 'w') as writer: + for sample in tqdm(src_ds): + converted_sample = ms_swift_sharegpt_to_dj( + sample, + conversation_key=conversation_key, + human_key=human_key, + assistant_key=assistant_key, + system_key=system_key, + instruction_key=instruction_key, + multimodal_keys=multimodal_keys) + writer.write(converted_sample) + logger.info(f'Store the target dataset into [{tgt_ds_path}].') + + +if __name__ == '__main__': + fire.Fire(main)