feat: Add re-phrasing of questions
saattrupdan committed Apr 19, 2024
1 parent c81b47c commit ac13342
Showing 3 changed files with 90 additions and 43 deletions.
32 changes: 24 additions & 8 deletions config/config.yaml
@@ -9,34 +9,50 @@ dirs:
   final: final
 
 seed: 4242
-min_article_length: 100
 
-push_to_hub: false
-hub_id: alexandrainst/foqa
+# Data configuration
+min_article_length: 1000
+num_samples: 1000
 
+# Model configuration
 model: gpt-4-turbo-2024-04-09
 max_tokens: 1024
 temperature: 1.0
+push_to_hub: false
+hub_id: alexandrainst/foqa
 
 system_prompt: >
   You are a helpful Faroese question answering dataset generator. The only language you
   know is Faroese.
 prompt: |
   The following is a Wikipedia article in Faroese.
   <article>
   {article}
   </article>
-  Generate 2 to 5 questions about the article, which are all answered in the article.
+  Generate 2 to 10 questions about the article, depending on the length of the article, all of which are answered in the article.
   You also have to supply answers to the questions, and the answers have to appear exactly as written in the article (including same casing).
-  The answers have to be different from each other.
   Your questions must be hard to answer.
   The answers should only contain the answers themselves, and not the surrounding sentence - keep the answers as short as possible.
+  Rephrase your questions using synonyms, to avoid using phrases that are used directly in the article.
+  The answers have to be different from each other.
   All your questions and answers must be in Faroese.
   Your answer must be a JSON dictionary with the key "results", with the value being a list of dictionaries having keys "question" and "answer".
+follow_up_prompt: >
+  The following is a Faroese question.
+  <question>
+  {question}
+  </question>
+  Re-write the question, preserving the meaning, using synonyms or a different (valid) word order.
+  Your question must be in Faroese.
+  Your answer must be a JSON dictionary with the key "question".
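
For reference, the two prompts request JSON-mode output with different shapes: the main `prompt` asks for a `results` list of question/answer dictionaries, while the new `follow_up_prompt` asks for a single rephrased `question`. The sketch below shows how those two payloads fit together; the Faroese strings are invented placeholders for illustration, not data from the dataset.

```python
import json

# Invented example of a main-prompt response: a "results" list of
# question/answer dictionaries, answers copied verbatim from the article.
generation_output = (
    '{"results": [{"question": "Hvat eitur høvuðsstaðurin í Føroyum?",'
    ' "answer": "Tórshavn"}]}'
)
samples = json.loads(generation_output)["results"]

# Invented example of a follow_up_prompt response: a single rephrased question.
follow_up_output = '{"question": "Hvussu eitur høvuðsstaðurin í Føroyum?"}'
rephrased = json.loads(follow_up_output)["question"]

for sample in samples:
    sample["original_question"] = sample["question"]
    sample["question"] = rephrased
```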
75 changes: 40 additions & 35 deletions src/foqa/dataset.py
@@ -22,9 +22,14 @@ def build_dataset(config: DictConfig) -> None:
             The Hydra configuration object.
     """
     logger.info("Loading the Faroese Wikipedia dataset...")
-    dataset = load_dataset(
-        "alexandrainst/scandi-wiki", "fo", split="train", trust_remote_code=True
-    ).shuffle(seed=config.seed)
+    dataset = (
+        load_dataset(
+            "alexandrainst/scandi-wiki", "fo", split="train", trust_remote_code=True
+        )
+        .shuffle(seed=config.seed)
+        .filter(lambda x: len(x["text"]) > config.min_article_length)
+        .select(range(config.num_samples))
+    )
 
     records_path = Path(config.dirs.data) / config.dirs.raw / "records.jsonl"
     if records_path.exists():
@@ -33,40 +38,40 @@ def build_dataset(config: DictConfig) -> None:
     else:
         records = list()
 
-    for sample in tqdm(dataset, desc=f"Generating samples with {config.model}"):
-        sample_exists = any(record["id"] == sample["url"] for record in records)
-        if sample_exists:
-            continue
-
-        sample_is_too_short = len(sample["text"]) < config.min_article_length
-        if sample_is_too_short:
-            continue
-
-        try:
-            generated_samples = get_json_generation(
-                article=sample["text"], config=config
-            )
-        except Exception as e:
-            logger.info(
-                f"Failed to generate samples for {sample['url']} with error message "
-                f"{str(e)}. Skipping."
-            )
-            continue
-
-        with records_path.open("a") as f:
-            for generated_sample in generated_samples:
-                record = dict(
-                    id=sample["url"],
-                    title=sample["title"],
-                    context=sample["text"],
-                    question=generated_sample["question"],
-                    answers=dict(
-                        text=[generated_sample["answer"]],
-                        answer_start=sample["text"].find(generated_sample["answer"]),
-                    ),
-                )
-                records.append(record)
-                f.write(json.dumps(record) + "\n")
+    with tqdm(dataset, desc=f"Generating samples with {config.model}") as pbar:
+        for sample in pbar:
+            sample_exists = any(record["id"] == sample["url"] for record in records)
+            if sample_exists:
+                continue
+
+            try:
+                generated_samples = get_json_generation(
+                    article=sample["text"], config=config
+                )
+            except Exception as e:
+                logger.info(
+                    f"Failed to generate samples for {sample['url']} with error "
+                    f"{type(e)}: {e}. Skipping."
+                )
+                continue
+
+            with records_path.open("a") as f:
+                for generated_sample in generated_samples:
+                    record = dict(
+                        id=sample["url"],
+                        title=sample["title"],
+                        context=sample["text"],
+                        question=generated_sample["question"],
+                        answers=dict(
+                            text=[generated_sample["answer"]],
+                            answer_start=sample["text"].find(
+                                generated_sample["answer"]
+                            ),
+                        ),
+                    )
+                    records.append(record)
+                    f.write(json.dumps(record) + "\n")
+            pbar.set_postfix(samples_generated=len(records))
 
     logger.info("Converting the records to a Hugging Face dataset...")
     df = pd.DataFrame.from_records(records)
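The records written above follow a SQuAD-style layout (`id`, `title`, `context`, `question`, and an `answers` dict with `text` and `answer_start`). Below is a small sanity-check sketch for the resulting `records.jsonl`; the path assumes the `data`/`raw` directory layout implied by `config.dirs` and is otherwise illustrative.

```python
import json
from pathlib import Path

# Assumed location, following config.dirs.data / config.dirs.raw above.
records_path = Path("data") / "raw" / "records.jsonl"

with records_path.open() as f:
    records = [json.loads(line) for line in f if line.strip()]

for record in records:
    answer = record["answers"]["text"][0]
    start = record["answers"]["answer_start"]
    # answer_start comes from str.find, so -1 means the answer string was
    # not found verbatim in the context.
    if start != -1:
        assert record["context"][start : start + len(answer)] == answer
```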
26 changes: 26 additions & 0 deletions src/foqa/openai.py
@@ -70,4 +70,30 @@ def get_json_generation(article: str, config: DictConfig) -> list[dict[str, str]
         ):
             generated_samples.append(generated_sample)
 
+    # Re-phrase the generated questions
+    for generated_sample in generated_samples:
+        model_output = client.chat.completions.create(
+            messages=[
+                ChatCompletionSystemMessageParam(
+                    role="system", content=config.system_prompt
+                ),
+                ChatCompletionUserMessageParam(
+                    role="user",
+                    content=config.follow_up_prompt.format(
+                        question=generated_sample["question"]
+                    ),
+                ),
+            ],
+            model=config.model,
+            max_tokens=config.max_tokens,
+            temperature=config.temperature,
+            response_format=ResponseFormat(type="json_object"),
+        )
+        generation_output = model_output.choices[0].message.content
+        assert isinstance(generation_output, str)
+        json_obj = json.loads(generation_output)
+        assert isinstance(json_obj, dict) and "question" in json_obj
+        generated_sample["original_question"] = generated_sample["question"]
+        generated_sample["question"] = json_obj["question"]
+
     return generated_samples
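
With this change, `get_json_generation` makes one extra API call per generated question: the first call produces the question/answer pairs, and a follow-up call rewrites each question via `follow_up_prompt`, keeping the original under `original_question`. A rough usage sketch follows, assuming the module is importable as `foqa.openai`, an OpenAI API key is available to the module-level client, and the Hydra config lives at `config/config.yaml`.

```python
from omegaconf import OmegaConf

from foqa.openai import get_json_generation  # assumed import path

config = OmegaConf.load("config/config.yaml")
article = "Tórshavn er høvuðsstaðurin í Føroyum."  # invented one-line "article"

samples = get_json_generation(article=article, config=config)
for sample in samples:
    # Each sample now carries the rephrased question, the original question,
    # and the verbatim answer span.
    print(sample["question"], "|", sample["original_question"], "->", sample["answer"])
```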
