From 7519a658b2b2fd23b1bf21742072f54f72a9564b Mon Sep 17 00:00:00 2001 From: "lielin.hyl" Date: Mon, 23 Dec 2024 21:03:33 +0800 Subject: [PATCH] + add conversion tools for Alpaca format --- .../dj_to_alpaca.py | 110 +++++++++++++++ .../dj_to_llama_factory_sharegpt.py | 2 +- .../alpaca_to_dj.py | 131 ++++++++++++++++++ .../ms_swift_sharegpt_to_dj.py | 2 +- 4 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py create mode 100644 tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/alpaca_to_dj.py diff --git a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py new file mode 100644 index 000000000..f79fd0c43 --- /dev/null +++ b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_alpaca.py @@ -0,0 +1,110 @@ +# This tool is used to convert dataset in Data-Juicer format to a +# target dataset in Alpaca-like format. +# +# Data-Juicer format (query-response format): +# [ +# { +# "system": "", +# "instruction": "", +# "query": "", +# "response": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ... +# ] +# +# Corresponding Alpaca format: +# [ +# { +# "system": "", +# "instruction": "", +# "input": "", +# "output": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ...... 
+# ] +# +# Reference: +# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md +# https://github.com/hiyouga/LLaMA-Factory/blob/v0.9.1/data/README.md#alpaca-format + +import json +import os + +import fire +import jsonlines as jl +from loguru import logger +from tqdm import tqdm + + +def dj_to_alpaca( + sample, + input_key: str = 'input', + output_key: str = 'output', +): + modified_keys = {'query', 'response'} + new_sample = { + key: sample[key] + for key in sample if key not in modified_keys and sample[key] + } + + # key mapping + if 'query' in sample: + new_sample[input_key] = sample['query'] + if 'response' in sample: + new_sample[output_key] = sample['response'] + + return new_sample + + +@logger.catch(reraise=True) +def main( + src_ds_path: str, + tgt_ds_path: str, + input_key: str = 'input', + output_key: str = 'output', +): + """ + Convert a Data-Juicer dataset to the Alpaca-like format. + + :param src_ds_path: the path to the source dataset. + :param tgt_ds_path: the path to store the converted target dataset. + :param input_key: the field key to store the query sentence from human. + :param output_key: the field key to store the response sentence from + assistant. 
+ """ + + # check arguments + # check paths + if not os.path.exists(src_ds_path): + raise FileNotFoundError( + f'Input dataset [{src_ds_path}] can not be found.') + if not tgt_ds_path.endswith('.json'): + raise ValueError('Only support "json" target dataset file now.') + if os.path.dirname(tgt_ds_path) \ + and not os.path.exists(os.path.dirname(tgt_ds_path)): + logger.info(f'Create directory [{os.path.dirname(tgt_ds_path)}] ' + f'for the target dataset.') + os.makedirs(os.path.dirname(tgt_ds_path)) + + samples = [] + with jl.open(src_ds_path, 'r') as reader: + for sample in tqdm(reader): + converted_sample = dj_to_alpaca(sample, + input_key=input_key, + output_key=output_key) + samples.append(converted_sample) + + logger.info(f'Store the target dataset into [{tgt_ds_path}].') + json.dump(samples, open(tgt_ds_path, 'w', encoding='utf-8')) + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py index 8bcc8207f..c72dcbb84 100644 --- a/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py +++ b/tools/fmt_conversion/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_llama_factory_sharegpt.py @@ -137,7 +137,7 @@ def main( instruction_role: str = 'instruction', ): """ - Convert a ShareGPT-like dataset to the Data-Juicer query-response format. + Convert a Data-Juicer dataset to the LLaMA-Factory ShareGPT-like format. :param src_ds_path: the path to the source dataset. :param tgt_ds_path: the path to store the converted target dataset. 
diff --git a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/alpaca_to_dj.py b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/alpaca_to_dj.py new file mode 100644 index 000000000..d35826a63 --- /dev/null +++ b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/alpaca_to_dj.py @@ -0,0 +1,131 @@ +# This tool is used to convert dataset in Alpaca format to a +# target dataset in Data-Juicer query-response format. +# +# Alpaca format: +# [ +# { +# "system": "", +# "instruction": "", +# "input": "", +# "output": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ...... +# ] +# +# Corresponding Data-Juicer format (query-response format): +# [ +# { +# "system": "", +# "instruction": "", +# "query": "", +# "response": "", +# "history": [ +# ["human instruction in the first round (optional)", "model response in the first round (optional)"], # noqa: E501 +# ["human instruction in the second round (optional)", "model response in the second round (optional)"] # noqa: E501 +# ], +# }, +# ... 
+# ] +# +# Reference: +# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md +# https://github.com/hiyouga/LLaMA-Factory/blob/v0.9.1/data/README.md#alpaca-format + +import json +import os +from typing import List, Union + +import fire +import jsonlines as jl +from loguru import logger +from tqdm import tqdm + + +def alpaca_to_dj( + sample, + input_key: str = 'input', + output_key: str = 'output', + multimodal_keys: Union[str, List[str]] = None, +): + modified_keys = {input_key, output_key} + if multimodal_keys: + modified_keys = modified_keys.union(set(multimodal_keys)) + new_sample = { + key: sample[key] + for key in sample if key not in modified_keys + } + + # key mapping for input and output + if input_key in sample: + new_sample['query'] = sample[input_key] + if output_key in sample: + new_sample['response'] = sample[output_key] + + # update multimodal data + if multimodal_keys: + for mm_key in multimodal_keys: + if not isinstance(sample[mm_key], list): + new_sample[mm_key] = [sample[mm_key]] + else: + new_sample[mm_key] = sample[mm_key] + + return new_sample + + +@logger.catch(reraise=True) +def main( + src_ds_path: str, + tgt_ds_path: str, + input_key: str = 'input', + output_key: str = 'output', + multimodal_keys: Union[str, List[str]] = None, +): + """ + Convert an Alpaca-like dataset to the Data-Juicer + query-response format. + + :param src_ds_path: the path to the source Alpaca-like dataset. + :param tgt_ds_path: the path to store the converted target dataset. + :param input_key: the field key to store the query sentence from human. + :param output_key: the field key to store the response sentence from + assistant. + :param multimodal_keys: optional keys to store multimodal data. 
+ """ + + # check arguments + # check paths + if not os.path.exists(src_ds_path): + raise FileNotFoundError( + f'Input dataset [{src_ds_path}] can not be found.') + if not tgt_ds_path.endswith('.jsonl'): + raise ValueError('Only support "jsonl" target dataset file now.') + if os.path.dirname(tgt_ds_path) \ + and not os.path.exists(os.path.dirname(tgt_ds_path)): + logger.info(f'Create directory [{os.path.dirname(tgt_ds_path)}] ' + f'for the target dataset.') + os.makedirs(os.path.dirname(tgt_ds_path)) + + if isinstance(multimodal_keys, str): + multimodal_keys = [multimodal_keys] + + # load Alpaca dataset + logger.info('Loading original dataset.') + src_ds = json.load(open(src_ds_path, 'r', encoding='utf-8')) + logger.info(f'Load [{len(src_ds)}] samples.') + + with jl.open(tgt_ds_path, 'w') as writer: + for sample in tqdm(src_ds): + converted_sample = alpaca_to_dj(sample, + input_key=input_key, + output_key=output_key, + multimodal_keys=multimodal_keys) + writer.write(converted_sample) + logger.info(f'Store the target dataset into [{tgt_ds_path}].') + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py index c3364150b..9fced6d4c 100644 --- a/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py +++ b/tools/fmt_conversion/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py @@ -145,7 +145,7 @@ def main( if isinstance(multimodal_keys, str): multimodal_keys = [multimodal_keys] - # load ShareGPT dataset + # load dataset logger.info('Loading original dataset.') src_ds = json.load(open(src_ds_path, 'r', encoding='utf-8')) logger.info(f'Load [{len(src_ds)}] samples.')