Closes #894 (#895)

* Add 4-option MedQA subsets
  Signed-off-by: katielink <[email protected]>
* Add to hub_repos and fix bug
  Signed-off-by: katielink <[email protected]>
---------
Signed-off-by: katielink <[email protected]>
bigscience-workshop · Aug 31, 2023 · 06790f1 · 06790f1
1 parent cf4779a
commit 06790f1
Show file tree

Hide file tree

Showing 2 changed files with 120 additions and 2 deletions.
diff --git a/bigbio/biodatasets/med_qa/med_qa.py b/bigbio/biodatasets/med_qa/med_qa.py
@@ -111,12 +111,47 @@ class MedQADataset(datasets.GeneratorBasedBuilder):
                 subset_id=f"med_qa_{subset}",
             )
         )
+        if subset == "en" or subset == "zh":
+            BUILDER_CONFIGS.append(
+                BigBioConfig(
+                    name=f"med_qa_{subset}_4options_source",
+                    version=SOURCE_VERSION,
+                    description=f"MedQA {_SUBSET2NAME.get(subset)} source schema (4 options)",
+                    schema="source",
+                    subset_id=f"med_qa_{subset}_4options",
+                )
+            )
+            BUILDER_CONFIGS.append(
+                BigBioConfig(
+                    name=f"med_qa_{subset}_4options_bigbio_qa",
+                    version=BIGBIO_VERSION,
+                    description=f"MedQA {_SUBSET2NAME.get(subset)} BigBio schema (4 options)",
+                    schema="bigbio_qa",
+                    subset_id=f"med_qa_{subset}_4options",
+                )
+            )
 
     DEFAULT_CONFIG_NAME = "med_qa_en_source"
 
     def _info(self) -> datasets.DatasetInfo:
 
-        if self.config.schema == "source":
+        if self.config.name == "med_qa_en_4options_source":
+            features = datasets.Features(
+                {
+                    "meta_info": datasets.Value("string"),
+                    "question": datasets.Value("string"),
+                    "answer_idx": datasets.Value("string"),
+                    "answer": datasets.Value("string"),
+                    "options": [
+                        {
+                            "key": datasets.Value("string"),
+                            "value": datasets.Value("string"),
+                        }
+                    ],
+                    "metamap_phrases": datasets.Sequence(datasets.Value("string")),
+                }
+            )
+        elif self.config.schema == "source":
             features = datasets.Features(
                 {
                     "meta_info": datasets.Value("string"),
@@ -180,6 +215,30 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
                     base_dir, "Taiwan", "tw_translated_jsonl", "zh", "dev-2zh.jsonl"
                 ),
             }
+        elif self.config.subset_id == "med_qa_en_4options":
+            paths = {
+                "train": os.path.join(
+                    base_dir, "US", "4_options", "phrases_no_exclude_train.jsonl"
+                ),
+                "test": os.path.join(
+                    base_dir, "US", "4_options", "phrases_no_exclude_test.jsonl"
+                ),
+                "valid": os.path.join(
+                    base_dir, "US", "4_options", "phrases_no_exclude_dev.jsonl"
+                ),
+            }
+        elif self.config.subset_id == "med_qa_zh_4options":
+            paths = {
+                "train": os.path.join(
+                    base_dir, "Mainland", "4_options", "train.jsonl"
+                ),
+                "test": os.path.join(
+                    base_dir, "Mainland", "4_options", "test.jsonl"
+                ),
+                "valid": os.path.join(
+                    base_dir, "Mainland", "4_options", "dev.jsonl"
+                ),
+            }
 
         return [
             datasets.SplitGenerator(

diff --git a/bigbio/hub/hub_repos/med_qa/med_qa.py b/bigbio/hub/hub_repos/med_qa/med_qa.py
@@ -110,12 +110,47 @@ class MedQADataset(datasets.GeneratorBasedBuilder):
                 subset_id=f"med_qa_{subset}",
             )
         )
+        if subset == "en" or subset == "zh":
+            BUILDER_CONFIGS.append(
+                BigBioConfig(
+                    name=f"med_qa_{subset}_4options_source",
+                    version=SOURCE_VERSION,
+                    description=f"MedQA {_SUBSET2NAME.get(subset)} source schema (4 options)",
+                    schema="source",
+                    subset_id=f"med_qa_{subset}_4options",
+                )
+            )
+            BUILDER_CONFIGS.append(
+                BigBioConfig(
+                    name=f"med_qa_{subset}_4options_bigbio_qa",
+                    version=BIGBIO_VERSION,
+                    description=f"MedQA {_SUBSET2NAME.get(subset)} BigBio schema (4 options)",
+                    schema="bigbio_qa",
+                    subset_id=f"med_qa_{subset}_4options",
+                )
+            )
 
     DEFAULT_CONFIG_NAME = "med_qa_en_source"
 
     def _info(self) -> datasets.DatasetInfo:
 
-        if self.config.schema == "source":
+        if self.config.name == "med_qa_en_4options_source":
+            features = datasets.Features(
+                {
+                    "meta_info": datasets.Value("string"),
+                    "question": datasets.Value("string"),
+                    "answer_idx": datasets.Value("string"),
+                    "answer": datasets.Value("string"),
+                    "options": [
+                        {
+                            "key": datasets.Value("string"),
+                            "value": datasets.Value("string"),
+                        }
+                    ],
+                    "metamap_phrases": datasets.Sequence(datasets.Value("string")),
+                }
+            )
+        elif self.config.schema == "source":
             features = datasets.Features(
                 {
                     "meta_info": datasets.Value("string"),
@@ -179,6 +214,30 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
                     base_dir, "Taiwan", "tw_translated_jsonl", "zh", "dev-2zh.jsonl"
                 ),
             }
+        elif self.config.subset_id == "med_qa_en_4options":
+            paths = {
+                "train": os.path.join(
+                    base_dir, "US", "4_options", "phrases_no_exclude_train.jsonl"
+                ),
+                "test": os.path.join(
+                    base_dir, "US", "4_options", "phrases_no_exclude_test.jsonl"
+                ),
+                "valid": os.path.join(
+                    base_dir, "US", "4_options", "phrases_no_exclude_dev.jsonl"
+                ),
+            }
+        elif self.config.subset_id == "med_qa_zh_4options":
+            paths = {
+                "train": os.path.join(
+                    base_dir, "Mainland", "4_options", "train.jsonl"
+                ),
+                "test": os.path.join(
+                    base_dir, "Mainland", "4_options", "test.jsonl"
+                ),
+                "valid": os.path.join(
+                    base_dir, "Mainland", "4_options", "dev.jsonl"
+                ),
+            }
 
         return [
             datasets.SplitGenerator(