diff --git a/bigbio/biodatasets/med_qa/med_qa.py b/bigbio/biodatasets/med_qa/med_qa.py index 084226fea..3cc17d3b2 100644 --- a/bigbio/biodatasets/med_qa/med_qa.py +++ b/bigbio/biodatasets/med_qa/med_qa.py @@ -111,12 +111,47 @@ class MedQADataset(datasets.GeneratorBasedBuilder): subset_id=f"med_qa_{subset}", ) ) + if subset == "en" or subset == "zh": + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"med_qa_{subset}_4options_source", + version=SOURCE_VERSION, + description=f"MedQA {_SUBSET2NAME.get(subset)} source schema (4 options)", + schema="source", + subset_id=f"med_qa_{subset}_4options", + ) + ) + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"med_qa_{subset}_4options_bigbio_qa", + version=BIGBIO_VERSION, + description=f"MedQA {_SUBSET2NAME.get(subset)} BigBio schema (4 options)", + schema="bigbio_qa", + subset_id=f"med_qa_{subset}_4options", + ) + ) DEFAULT_CONFIG_NAME = "med_qa_en_source" def _info(self) -> datasets.DatasetInfo: - if self.config.schema == "source": + if self.config.name == "med_qa_en_4options_source": + features = datasets.Features( + { + "meta_info": datasets.Value("string"), + "question": datasets.Value("string"), + "answer_idx": datasets.Value("string"), + "answer": datasets.Value("string"), + "options": [ + { + "key": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "metamap_phrases": datasets.Sequence(datasets.Value("string")), + } + ) + elif self.config.schema == "source": features = datasets.Features( { "meta_info": datasets.Value("string"), @@ -180,6 +215,30 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: base_dir, "Taiwan", "tw_translated_jsonl", "zh", "dev-2zh.jsonl" ), } + elif self.config.subset_id == "med_qa_en_4options": + paths = { + "train": os.path.join( + base_dir, "US", "4_options", "phrases_no_exclude_train.jsonl" + ), + "test": os.path.join( + base_dir, "US", "4_options", "phrases_no_exclude_test.jsonl" + ), + "valid": os.path.join( + base_dir, "US", "4_options", "phrases_no_exclude_dev.jsonl" + ), + } + elif self.config.subset_id == "med_qa_zh_4options": + paths = { + "train": os.path.join( + base_dir, "Mainland", "4_options", "train.jsonl" + ), + "test": os.path.join( + base_dir, "Mainland", "4_options", "test.jsonl" + ), + "valid": os.path.join( + base_dir, "Mainland", "4_options", "dev.jsonl" + ), + } return [ datasets.SplitGenerator( diff --git a/bigbio/hub/hub_repos/med_qa/med_qa.py b/bigbio/hub/hub_repos/med_qa/med_qa.py index d2985a55c..b02c1c179 100644 --- a/bigbio/hub/hub_repos/med_qa/med_qa.py +++ b/bigbio/hub/hub_repos/med_qa/med_qa.py @@ -110,12 +110,47 @@ class MedQADataset(datasets.GeneratorBasedBuilder): subset_id=f"med_qa_{subset}", ) ) + if subset == "en" or subset == "zh": + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"med_qa_{subset}_4options_source", + version=SOURCE_VERSION, + description=f"MedQA {_SUBSET2NAME.get(subset)} source schema (4 options)", + schema="source", + subset_id=f"med_qa_{subset}_4options", + ) + ) + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"med_qa_{subset}_4options_bigbio_qa", + version=BIGBIO_VERSION, + description=f"MedQA {_SUBSET2NAME.get(subset)} BigBio schema (4 options)", + schema="bigbio_qa", + subset_id=f"med_qa_{subset}_4options", + ) + ) DEFAULT_CONFIG_NAME = "med_qa_en_source" def _info(self) -> datasets.DatasetInfo: - if self.config.schema == "source": + if self.config.name == "med_qa_en_4options_source": + features = datasets.Features( + { + "meta_info": datasets.Value("string"), + "question": datasets.Value("string"), + "answer_idx": datasets.Value("string"), + "answer": datasets.Value("string"), + "options": [ + { + "key": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "metamap_phrases": datasets.Sequence(datasets.Value("string")), + } + ) + elif self.config.schema == "source": features = datasets.Features( { "meta_info": datasets.Value("string"), @@ -179,6 +214,30 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: base_dir, "Taiwan", "tw_translated_jsonl", "zh", "dev-2zh.jsonl" ), } + elif self.config.subset_id == "med_qa_en_4options": + paths = { + "train": os.path.join( + base_dir, "US", "4_options", "phrases_no_exclude_train.jsonl" + ), + "test": os.path.join( + base_dir, "US", "4_options", "phrases_no_exclude_test.jsonl" + ), + "valid": os.path.join( + base_dir, "US", "4_options", "phrases_no_exclude_dev.jsonl" + ), + } + elif self.config.subset_id == "med_qa_zh_4options": + paths = { + "train": os.path.join( + base_dir, "Mainland", "4_options", "train.jsonl" + ), + "test": os.path.join( + base_dir, "Mainland", "4_options", "test.jsonl" + ), + "valid": os.path.join( + base_dir, "Mainland", "4_options", "dev.jsonl" + ), + } return [ datasets.SplitGenerator(