Skip to content

Commit

Permalink
Closes #894 (#895)
Browse files (browse the repository at this point in the history)
* Add 4-option MedQA subsets

Signed-off-by: katielink <[email protected]>

* Add to hub_repos and fix bug

Signed-off-by: katielink <[email protected]>

---------

Signed-off-by: katielink <[email protected]>
  • Loading branch information
katielink authored Aug 31, 2023
1 parent cf4779a commit 06790f1
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 2 deletions.
61 changes: 60 additions & 1 deletion bigbio/biodatasets/med_qa/med_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,47 @@ class MedQADataset(datasets.GeneratorBasedBuilder):
subset_id=f"med_qa_{subset}",
)
)
if subset == "en" or subset == "zh":
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"med_qa_{subset}_4options_source",
version=SOURCE_VERSION,
description=f"MedQA {_SUBSET2NAME.get(subset)} source schema (4 options)",
schema="source",
subset_id=f"med_qa_{subset}_4options",
)
)
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"med_qa_{subset}_4options_bigbio_qa",
version=BIGBIO_VERSION,
description=f"MedQA {_SUBSET2NAME.get(subset)} BigBio schema (4 options)",
schema="bigbio_qa",
subset_id=f"med_qa_{subset}_4options",
)
)

DEFAULT_CONFIG_NAME = "med_qa_en_source"

def _info(self) -> datasets.DatasetInfo:

if self.config.schema == "source":
if self.config.name == "med_qa_en_4options_source":
features = datasets.Features(
{
"meta_info": datasets.Value("string"),
"question": datasets.Value("string"),
"answer_idx": datasets.Value("string"),
"answer": datasets.Value("string"),
"options": [
{
"key": datasets.Value("string"),
"value": datasets.Value("string"),
}
],
"metamap_phrases": datasets.Sequence(datasets.Value("string")),
}
)
elif self.config.schema == "source":
features = datasets.Features(
{
"meta_info": datasets.Value("string"),
Expand Down Expand Up @@ -180,6 +215,30 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
base_dir, "Taiwan", "tw_translated_jsonl", "zh", "dev-2zh.jsonl"
),
}
elif self.config.subset_id == "med_qa_en_4options":
paths = {
"train": os.path.join(
base_dir, "US", "4_options", "phrases_no_exclude_train.jsonl"
),
"test": os.path.join(
base_dir, "US", "4_options", "phrases_no_exclude_test.jsonl"
),
"valid": os.path.join(
base_dir, "US", "4_options", "phrases_no_exclude_dev.jsonl"
),
}
elif self.config.subset_id == "med_qa_zh_4options":
paths = {
"train": os.path.join(
base_dir, "Mainland", "4_options", "train.jsonl"
),
"test": os.path.join(
base_dir, "Mainland", "4_options", "test.jsonl"
),
"valid": os.path.join(
base_dir, "Mainland", "4_options", "dev.jsonl"
),
}

return [
datasets.SplitGenerator(
Expand Down
61 changes: 60 additions & 1 deletion bigbio/hub/hub_repos/med_qa/med_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,47 @@ class MedQADataset(datasets.GeneratorBasedBuilder):
subset_id=f"med_qa_{subset}",
)
)
if subset == "en" or subset == "zh":
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"med_qa_{subset}_4options_source",
version=SOURCE_VERSION,
description=f"MedQA {_SUBSET2NAME.get(subset)} source schema (4 options)",
schema="source",
subset_id=f"med_qa_{subset}_4options",
)
)
BUILDER_CONFIGS.append(
BigBioConfig(
name=f"med_qa_{subset}_4options_bigbio_qa",
version=BIGBIO_VERSION,
description=f"MedQA {_SUBSET2NAME.get(subset)} BigBio schema (4 options)",
schema="bigbio_qa",
subset_id=f"med_qa_{subset}_4options",
)
)

DEFAULT_CONFIG_NAME = "med_qa_en_source"

def _info(self) -> datasets.DatasetInfo:

if self.config.schema == "source":
if self.config.name == "med_qa_en_4options_source":
features = datasets.Features(
{
"meta_info": datasets.Value("string"),
"question": datasets.Value("string"),
"answer_idx": datasets.Value("string"),
"answer": datasets.Value("string"),
"options": [
{
"key": datasets.Value("string"),
"value": datasets.Value("string"),
}
],
"metamap_phrases": datasets.Sequence(datasets.Value("string")),
}
)
elif self.config.schema == "source":
features = datasets.Features(
{
"meta_info": datasets.Value("string"),
Expand Down Expand Up @@ -179,6 +214,30 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
base_dir, "Taiwan", "tw_translated_jsonl", "zh", "dev-2zh.jsonl"
),
}
elif self.config.subset_id == "med_qa_en_4options":
paths = {
"train": os.path.join(
base_dir, "US", "4_options", "phrases_no_exclude_train.jsonl"
),
"test": os.path.join(
base_dir, "US", "4_options", "phrases_no_exclude_test.jsonl"
),
"valid": os.path.join(
base_dir, "US", "4_options", "phrases_no_exclude_dev.jsonl"
),
}
elif self.config.subset_id == "med_qa_zh_4options":
paths = {
"train": os.path.join(
base_dir, "Mainland", "4_options", "train.jsonl"
),
"test": os.path.join(
base_dir, "Mainland", "4_options", "test.jsonl"
),
"valid": os.path.join(
base_dir, "Mainland", "4_options", "dev.jsonl"
),
}

return [
datasets.SplitGenerator(
Expand Down

0 comments on commit 06790f1

Please sign in to comment.