From 7519a658b2b2fd23b1bf21742072f54f72a9564b Mon Sep 17 00:00:00 2001 From: "lielin.hyl" Date: Mon, 23 Dec 2024 21:03:33 +0800 Subject: [PATCH] + add conversion tools for Alpaca format --- .../dj_to_alpaca.py | 110 +++++++++++++++ .../dj_to_llama_factory_sharegpt.py | 2 +- .../alpaca_to_dj.py | 131 ++++++++++++++++++ .../ms_swift_sharegpt_to_dj.py | 2 +- 4 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py create mode 100644 tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/alpaca_to_dj.py diff --git a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py new file mode 100644 index 000000000..f79fd0c43 --- /dev/null +++ b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py @@ -0,0 +1,110 @@ +# This tool is used to convert dataset in Data-Juicer format to a +# target dataset in Alpaca-like format. +# +# Data-Juicer format (query-response format): +# [ +# { +# "system": "", +# "instruction": "", +# "query": "", +# "response": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ... +# ] +# +# Corresponding Alpaca format: +# [ +# { +# "system": "", +# "instruction": "", +# "input": "", +# "output": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ...... 
+# ] +# +# Reference: +# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md +# https://github.com/hiyouga/LLaMA-Factory/blob/v0.9.1/data/README.md#alpaca-format + +import json +import os + +import fire +import jsonlines as jl +from loguru import logger +from tqdm import tqdm + + +def dj_to_alpaca( + sample, + input_key: str = 'input', + output_key: str = 'output', +): + modified_keys = {'query', 'response'} + new_sample = { + key: sample[key] + for key in sample if key not in modified_keys and sample[key] + } + + # key mapping + if 'query' in sample: + new_sample[input_key] = sample['query'] + if 'response' in sample: + new_sample[output_key] = sample['response'] + + return new_sample + + +@logger.catch(reraise=True) +def main( + src_ds_path: str, + tgt_ds_path: str, + input_key: str = 'input', + output_key: str = 'output', +): + """ + Convert a Data-Juicer dataset to the Alpaca-like format. + + :param src_ds_path: the path to the source dataset. + :param tgt_ds_path: the path to store the converted target dataset. + :param input_key: the field key to store the query sentence from human. + :param output_key: the field key to store the response sentence from + assistant. 
+ """ + + # check arguments + # check paths + if not os.path.exists(src_ds_path): + raise FileNotFoundError( + f'Input dataset [{src_ds_path}] can not be found.') + if not tgt_ds_path.endswith('.json'): + raise ValueError('Only support "json" target dataset file now.') + if os.path.dirname(tgt_ds_path) \ + and not os.path.exists(os.path.dirname(tgt_ds_path)): + logger.info(f'Create directory [{os.path.dirname(tgt_ds_path)}] ' + f'for the target dataset.') + os.makedirs(os.path.dirname(tgt_ds_path)) + + samples = [] + with jl.open(src_ds_path, 'r') as reader: + for sample in tqdm(reader): + converted_sample = dj_to_alpaca(sample, + input_key=input_key, + output_key=output_key) + samples.append(converted_sample) + + logger.info(f'Store the target dataset into [{tgt_ds_path}].') + json.dump(samples, open(tgt_ds_path, 'w', encoding='utf-8')) + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py index 8bcc8207f..c72dcbb84 100644 --- a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py +++ b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py @@ -137,7 +137,7 @@ def main( instruction_role: str = 'instruction', ): """ - Convert a ShareGPT-like dataset to the Data-Juicer query-response format. + Convert a Data-Juicer dataset to the LLaMA-Factory ShareGPT-like format. :param src_ds_path: the path to the source dataset. :param tgt_ds_path: the path to store the converted target dataset. 
diff --git a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/alpaca_to_dj.py b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/alpaca_to_dj.py new file mode 100644 index 000000000..d35826a63 --- /dev/null +++ b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/alpaca_to_dj.py @@ -0,0 +1,131 @@ +# This tool is used to convert dataset in Alpaca format to a +# target dataset in Data-Juicer query-response format. +# +# Alpaca format: +# [ +# { +# "system": "", +# "instruction": "", +# "input": "", +# "output": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ...... +# ] +# +# Corresponding Data-Juicer format (query-response format): +# [ +# { +# "system": "", +# "instruction": "", +# "query": "", +# "response": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ... 
+# ] +# +# Reference: +# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md +# https://github.com/hiyouga/LLaMA-Factory/blob/v0.9.1/data/README.md#alpaca-format + +import json +import os +from typing import List, Union + +import fire +import jsonlines as jl +from loguru import logger +from tqdm import tqdm + + +def alpaca_to_dj( + sample, + input_key: str = 'input', + output_key: str = 'output', + multimodal_keys: Union[str, List[str]] = None, +): + modified_keys = {input_key, output_key} + if multimodal_keys: + modified_keys = modified_keys.union(set(multimodal_keys)) + new_sample = { + key: sample[key] + for key in sample if key not in modified_keys + } + + # key mapping for input and output + if input_key in sample: + new_sample['query'] = sample[input_key] + if output_key in sample: + new_sample['response'] = sample[output_key] + + # update multimodal data + if multimodal_keys: + for mm_key in multimodal_keys: + if not isinstance(sample[mm_key], list): + new_sample[mm_key] = [sample[mm_key]] + else: + new_sample[mm_key] = sample[mm_key] + + return new_sample + + +@logger.catch(reraise=True) +def main( + src_ds_path: str, + tgt_ds_path: str, + input_key: str = 'input', + output_key: str = 'output', + multimodal_keys: Union[str, List[str]] = None, +): + """ + Convert an Alpaca-like dataset to the Data-Juicer + query-response format. + + :param src_ds_path: the path to the source Alpaca-like dataset. + :param tgt_ds_path: the path to store the converted target dataset. + :param input_key: the field key to store the query sentence from human. + :param output_key: the field key to store the response sentence from + assistant. + :param multimodal_keys: optional keys to store multimodal data. 
+ """ + + # check arguments + # check paths + if not os.path.exists(src_ds_path): + raise FileNotFoundError( + f'Input dataset [{src_ds_path}] can not be found.') + if not tgt_ds_path.endswith('.jsonl'): + raise ValueError('Only support "jsonl" target dataset file now.') + if os.path.dirname(tgt_ds_path) \ + and not os.path.exists(os.path.dirname(tgt_ds_path)): + logger.info(f'Create directory [{os.path.dirname(tgt_ds_path)}] ' + f'for the target dataset.') + os.makedirs(os.path.dirname(tgt_ds_path)) + + if isinstance(multimodal_keys, str): + multimodal_keys = [multimodal_keys] + + # load Alpaca dataset + logger.info('Loading original dataset.') + src_ds = json.load(open(src_ds_path, 'r', encoding='utf-8')) + logger.info(f'Load [{len(src_ds)}] samples.') + + with jl.open(tgt_ds_path, 'w') as writer: + for sample in tqdm(src_ds): + converted_sample = alpaca_to_dj(sample, + input_key=input_key, + output_key=output_key, + multimodal_keys=multimodal_keys) + writer.write(converted_sample) + logger.info(f'Store the target dataset into [{tgt_ds_path}].') + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py index c3364150b..9fced6d4c 100644 --- a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py +++ b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py @@ -145,7 +145,7 @@ def main( if isinstance(multimodal_keys, str): multimodal_keys = [multimodal_keys] - # load ShareGPT dataset + # load dataset logger.info('Loading original dataset.') src_ds = json.load(open(src_ds_path, 'r', encoding='utf-8')) logger.info(f'Load [{len(src_ds)}] samples.')