-
Notifications
You must be signed in to change notification settings - Fork 191
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
+ add conversion tools for ModelScope-Swift ShareGPT format
- Loading branch information
Showing
6 changed files
with
315 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
143 changes: 143 additions & 0 deletions
143
...version/post_tuning_dialog/data_juicer_format_to_target_format/dj_to_ms_swift_sharegpt.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
# This tool is used to convert dataset in Data-Juicer format to a | ||
# target dataset in ModelScope-Swift ShareGPT format. | ||
# | ||
# Data-Juicer format (query-response format): | ||
# [ | ||
# { | ||
# "system": "<system>", | ||
# "query": "<query2>", | ||
# "response": "<response2>", | ||
# "history": [ | ||
# [ | ||
# "<query1>", | ||
# "<response1>" | ||
# ], | ||
# ] | ||
# }, | ||
# ... | ||
# ] | ||
# | ||
# Corresponding ModelScope-Swift ShareGPT format: | ||
# [ | ||
# { | ||
# "system": "<system>", | ||
# "conversation": [ | ||
# { | ||
# "human": "<query1>", | ||
# "assistant": "<response1>" | ||
# }, | ||
# { | ||
# "human": "<query2>", | ||
# "assistant": "<response2>" | ||
# } | ||
# ] | ||
# }, | ||
# ...... | ||
# ] | ||
# | ||
# Reference: | ||
# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md | ||
|
||
import json | ||
import os | ||
|
||
import fire | ||
import jsonlines as jl | ||
from loguru import logger | ||
from tqdm import tqdm | ||
|
||
|
||
def dj_to_ms_swift_sharegpt(
    sample,
    conversation_key: str = 'conversation',
    human_key: str = 'human',
    assistant_key: str = 'assistant',
    system_key: str = 'system',
    instruction_key: str = 'instruction',
):
    """
    Convert one Data-Juicer query-response sample into a ModelScope-Swift
    ShareGPT-like sample.

    The fields "query", "response", "history", "system" and "instruction"
    are consumed by the conversion; every other field of the sample is
    copied to the result unchanged.

    :param sample: a dict-like sample in Data-Juicer query-response format.
    :param conversation_key: the output key to store the conversation list.
    :param human_key: the key of the human sentence inside each turn.
    :param assistant_key: the key of the assistant sentence inside each
        turn.
    :param system_key: the output key to store the system prompt.
    :param instruction_key: the output key to store the instruction.
    :return: a new dict; the input sample is not modified.
    :raises KeyError: if the sample has no "query" field.
    """
    modified_keys = {'query', 'response', 'history', 'system', 'instruction'}
    new_sample = {
        key: sample[key]
        for key in sample if key not in modified_keys
    }

    # find system prompt and instruction
    if 'system' in sample:
        new_sample[system_key] = sample['system']
    if 'instruction' in sample:
        new_sample[instruction_key] = sample['instruction']

    # construct conversation
    # single-turn samples may have no "history" field at all (or a null
    # one), which is treated the same as an empty history
    history = sample.get('history') or []
    conversation = [{
        human_key: query,
        assistant_key: response,
    } for query, response in history]
    # the current query/response pair is always the last turn; a missing
    # response is stored as an empty string
    conversation.append({
        human_key: sample['query'],
        assistant_key: sample.get('response', ''),
    })

    new_sample[conversation_key] = conversation

    return new_sample
|
||
|
||
@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    conversation_key: str = 'conversation',
    human_key: str = 'human',
    assistant_key: str = 'assistant',
    system_key: str = 'system',
    instruction_key: str = 'instruction',
):
    """
    Convert a Data-Juicer query-response dataset to the ModelScope-Swift
    ShareGPT-like format.

    The source dataset is read with jsonlines, every sample is converted,
    and all converted samples are written as one JSON list.

    :param src_ds_path: the path to the source dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
        Must end with ".json".
    :param conversation_key: the field key to store conversations.
    :param human_key: the field key to store the sentence from human.
    :param assistant_key: the field key to store the sentence from assistant.
    :param system_key: the field key to store the system prompt.
    :param instruction_key: the field key to store the instruction content.
    :raises FileNotFoundError: if the source dataset does not exist.
    :raises ValueError: if the target path does not end with ".json".
    """

    # check arguments
    # check paths
    if not os.path.exists(src_ds_path):
        raise FileNotFoundError(
            f'Input dataset [{src_ds_path}] can not be found.')
    if not tgt_ds_path.endswith('.json'):
        raise ValueError('Only support "json" target dataset file now.')
    tgt_dir = os.path.dirname(tgt_ds_path)
    if tgt_dir and not os.path.exists(tgt_dir):
        logger.info(f'Create directory [{tgt_dir}] '
                    f'for the target dataset.')
        # exist_ok guards against the directory appearing between the
        # existence check above and this call
        os.makedirs(tgt_dir, exist_ok=True)

    # load dataset
    samples = []
    with jl.open(src_ds_path, 'r') as reader:
        for sample in tqdm(reader):
            converted_sample = dj_to_ms_swift_sharegpt(
                sample,
                conversation_key=conversation_key,
                human_key=human_key,
                assistant_key=assistant_key,
                system_key=system_key,
                instruction_key=instruction_key)
            samples.append(converted_sample)
    logger.info(f'Store the target dataset into [{tgt_ds_path}].')
    # use a context manager so the output file is flushed and closed
    # deterministically instead of relying on garbage collection
    with open(tgt_ds_path, 'w', encoding='utf-8') as fout:
        json.dump(samples, fout)


if __name__ == '__main__':
    # expose `main` as a command-line interface
    fire.Fire(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
168 changes: 168 additions & 0 deletions
168
...version/post_tuning_dialog/source_format_to_data_juicer_format/ms_swift_sharegpt_to_dj.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
# This tool is used to convert dataset in ModelScope-Swift ShareGPT format to a | ||
# target dataset in Data-Juicer query-response format. | ||
# | ||
# ModelScope-Swift ShareGPT format: | ||
# [ | ||
# { | ||
# "system": "<system>", | ||
# "conversation": [ | ||
# { | ||
# "human": "<query1>", | ||
# "assistant": "<response1>" | ||
# }, | ||
# { | ||
# "human": "<query2>", | ||
# "assistant": "<response2>" | ||
# } | ||
# ] | ||
# }, | ||
# ...... | ||
# ] | ||
# | ||
# Corresponding Data-Juicer format (query-response format): | ||
# [ | ||
# { | ||
# "system": "<system>", | ||
# "query": "<query2>", | ||
# "response": "<response2>", | ||
# "history": [ | ||
# [ | ||
# "<query1>", | ||
# "<response1>" | ||
# ], | ||
# ] | ||
# }, | ||
# ... | ||
# ] | ||
# | ||
# Reference: | ||
# https://github.com/modelscope/ms-swift/blob/main/docs/source_en/Customization/Custom-dataset.md | ||
|
||
import json | ||
import os | ||
from typing import List, Union | ||
|
||
import fire | ||
import jsonlines as jl | ||
from loguru import logger | ||
from tqdm import tqdm | ||
|
||
|
||
def ms_swift_sharegpt_to_dj(
    sample,
    conversation_key: str = 'conversation',
    human_key: str = 'human',
    assistant_key: str = 'assistant',
    system_key: str = 'system',
    instruction_key: str = 'instruction',
    multimodal_keys: Union[str, List[str]] = None,
):
    """
    Convert one ModelScope-Swift ShareGPT-like sample into a Data-Juicer
    query-response sample.

    The last turn of the conversation becomes "query"/"response" and all
    earlier turns become "history". Fields other than the conversation,
    system, instruction and multimodal ones are copied unchanged.

    :param sample: a dict sample in ModelScope-Swift ShareGPT format.
    :param conversation_key: the field key that stores the conversation.
    :param human_key: the key of the human sentence inside each turn.
    :param assistant_key: the key of the assistant sentence inside each
        turn.
    :param system_key: the field key that stores the system prompt.
    :param instruction_key: the field key that stores the instruction.
    :param multimodal_keys: optional key or list of keys that store
        multimodal data. Their values are wrapped into lists if they are
        not lists already.
    :return: a new dict; the input sample is not modified.
    :raises KeyError: if the sample has no conversation field, or a turn
        lacks the human sentence.
    """
    # a bare string is a single key; without this it would be iterated
    # character by character below
    if isinstance(multimodal_keys, str):
        multimodal_keys = [multimodal_keys]

    modified_keys = {conversation_key, system_key, instruction_key}
    if multimodal_keys:
        modified_keys = modified_keys.union(multimodal_keys)
    new_sample = {
        key: sample[key]
        for key in sample if key not in modified_keys
    }

    # find system prompt and instruction
    if system_key in sample:
        new_sample['system'] = sample[system_key]
    if instruction_key in sample:
        new_sample['instruction'] = sample[instruction_key]

    # conversations to query, response, history
    conversation = sample[conversation_key]
    if not conversation:
        query = ''
        response = ''
        history = []
    else:
        # the last turn provides the query and the response; the response
        # may be absent (e.g. samples without an answer yet), in which
        # case it is stored as an empty string
        last_turn = conversation[-1]
        query = last_turn[human_key]
        response = last_turn.get(assistant_key, '')
        history = [[conv[human_key], conv[assistant_key]]
                   for conv in conversation[:-1]]

    # get the result sample
    new_sample.update({
        'query': query,
        'response': response,
        'history': history,
    })

    # update multimodal data
    if multimodal_keys:
        for mm_key in multimodal_keys:
            # samples without this kind of multimodal data are left
            # without the field instead of failing the whole conversion
            if mm_key not in sample:
                continue
            mm_value = sample[mm_key]
            if not isinstance(mm_value, list):
                new_sample[mm_key] = [mm_value]
            else:
                new_sample[mm_key] = mm_value

    return new_sample
|
||
|
||
@logger.catch(reraise=True)
def main(
    src_ds_path: str,
    tgt_ds_path: str,
    conversation_key: str = 'conversation',
    human_key: str = 'human',
    assistant_key: str = 'assistant',
    system_key: str = 'system',
    instruction_key: str = 'instruction',
    multimodal_keys: Union[str, List[str]] = None,
):
    """
    Convert a ModelScope-Swift ShareGPT-like dataset to the Data-Juicer
    query-response format.

    The source dataset is loaded as one JSON document, every sample is
    converted, and the converted samples are written with jsonlines.

    :param src_ds_path: the path to the source ShareGPT-like dataset.
    :param tgt_ds_path: the path to store the converted target dataset.
        Must end with ".jsonl".
    :param conversation_key: the field key to store conversations.
    :param human_key: the field key to store the sentence from human.
    :param assistant_key: the field key to store the sentence from assistant.
    :param system_key: the field key to store the system prompt.
    :param instruction_key: the field key to store the instruction content.
    :param multimodal_keys: optional keys to store multimodal data.
    :raises FileNotFoundError: if the source dataset does not exist.
    :raises ValueError: if the target path does not end with ".jsonl".
    """

    # check arguments
    # check paths
    if not os.path.exists(src_ds_path):
        raise FileNotFoundError(
            f'Input dataset [{src_ds_path}] can not be found.')
    if not tgt_ds_path.endswith('.jsonl'):
        raise ValueError('Only support "jsonl" target dataset file now.')
    tgt_dir = os.path.dirname(tgt_ds_path)
    if tgt_dir and not os.path.exists(tgt_dir):
        logger.info(f'Create directory [{tgt_dir}] '
                    f'for the target dataset.')
        # exist_ok guards against the directory appearing between the
        # existence check above and this call
        os.makedirs(tgt_dir, exist_ok=True)

    # a single key given as a plain string is treated as a one-item list
    if isinstance(multimodal_keys, str):
        multimodal_keys = [multimodal_keys]

    # load ShareGPT dataset
    logger.info('Loading original dataset.')
    # use a context manager so the input file handle is closed right
    # after loading instead of being left to garbage collection
    with open(src_ds_path, 'r', encoding='utf-8') as fin:
        src_ds = json.load(fin)
    logger.info(f'Load [{len(src_ds)}] samples.')

    with jl.open(tgt_ds_path, 'w') as writer:
        for sample in tqdm(src_ds):
            converted_sample = ms_swift_sharegpt_to_dj(
                sample,
                conversation_key=conversation_key,
                human_key=human_key,
                assistant_key=assistant_key,
                system_key=system_key,
                instruction_key=instruction_key,
                multimodal_keys=multimodal_keys)
            writer.write(converted_sample)
    logger.info(f'Store the target dataset into [{tgt_ds_path}].')


if __name__ == '__main__':
    # expose `main` as a command-line interface
    fire.Fire(main)