From 84673cc200c382a0b4e8fc233e0ac3c5b3f566d8 Mon Sep 17 00:00:00 2001 From: qbc Date: Thu, 11 Apr 2024 14:09:25 +0800 Subject: [PATCH] add medical train --- federatedscope/llm/dataloader/dataloader.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/federatedscope/llm/dataloader/dataloader.py b/federatedscope/llm/dataloader/dataloader.py index d6f468dda..0fe75c046 100644 --- a/federatedscope/llm/dataloader/dataloader.py +++ b/federatedscope/llm/dataloader/dataloader.py @@ -279,6 +279,19 @@ def load_llm_dataset(config=None, **kwargs): list_data_dict[i]['output'] = \ list_data_dict[i]['output'].replace('####', 'The answer is') dataset = LLMDataset(list_data_dict, tokenizer) + elif dataset_name.lower() == "medical_tc": + fp = os.path.join(config.data.root, 'medical_tc_train.jsonl') + if not os.path.exists(fp): + download_url( + 'https://federatedscope.oss-cn-beijing.aliyuncs.com/FS-LLM' + '/medical_tc_train.jsonl', config.data.root) + os.rename(os.path.join(config.data.root, 'train.jsonl'), fp) + list_data_dict = load_jsonl(fp, + instruction='question', + input='input', + output='output', + category='output') + dataset = LLMDataset(list_data_dict, tokenizer) elif dataset_name.lower() == 'code_search_net': from tqdm import tqdm from federatedscope.llm.dataset.code_search_net import \