feat: Add re-phrasing of questions
saattrupdan committed Apr 19, 2024
1 parent c81b47c commit ac13342
Showing 3 changed files with 90 additions and 43 deletions.
32 changes: 24 additions & 8 deletions config/config.yaml
@@ -9,34 +9,50 @@ dirs:
   final: final
 
 seed: 4242
-min_article_length: 100
 
-push_to_hub: false
-hub_id: alexandrainst/foqa
+# Data configuration
+min_article_length: 1000
+num_samples: 1000
 
+# Model configuration
 model: gpt-4-turbo-2024-04-09
 max_tokens: 1024
 temperature: 1.0
+push_to_hub: false
+hub_id: alexandrainst/foqa
 
 system_prompt: >
   You are a helpful Faroese question answering dataset generator. The only language you
   know is Faroese.
 prompt: |
   The following is a Wikipedia article in Faroese.
   <article>
   {article}
   </article>
-  Generate 2 to 5 questions about the article, which are all answered in the article.
+  Generate 2 to 10 questions about the article, depending on the length of the article, all of which are answered in the article.
   You also have to supply answers to the questions, and the answers have to appear exactly as written in the article (including same casing).
-  The answers have to be different from each other.
   Your questions must be hard to answer.
   The answers should only contain the answers themselves, and not the surrounding sentence - keep the answers as short as possible.
+  Rephrase your questions using synonyms, to avoid using phrases that are used directly in the article.
+  The answers have to be different from each other.
   All your questions and answers must be in Faroese.
   Your answer must be a JSON dictionary with the key "results", with the value being a list of dictionaries having keys "question" and "answer".
+follow_up_prompt: >
+  The following is a Faroese question.
+  <question>
+  {question}
+  </question>
+  Re-write the question, preserving the meaning, using synonyms or a different (valid) word order.
+  Your question must be in Faroese.
+  Your answer must be a JSON dictionary with the key "question".
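
For reference, the two prompts request JSON-mode output with different shapes: the main `prompt` asks for a `results` list of question/answer dictionaries, while the new `follow_up_prompt` asks for a single rephrased `question`. The sketch below shows how those two payloads fit together; the Faroese strings are invented placeholders for illustration, not data from the dataset.

```python
import json

# Invented example of a main-prompt response: a "results" list of
# question/answer dictionaries, answers copied verbatim from the article.
generation_output = (
    '{"results": [{"question": "Hvat eitur høvuðsstaðurin í Føroyum?",'
    ' "answer": "Tórshavn"}]}'
)
samples = json.loads(generation_output)["results"]

# Invented example of a follow_up_prompt response: a single rephrased question.
follow_up_output = '{"question": "Hvussu eitur høvuðsstaðurin í Føroyum?"}'
rephrased = json.loads(follow_up_output)["question"]

for sample in samples:
    sample["original_question"] = sample["question"]
    sample["question"] = rephrased
```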
75 changes: 40 additions & 35 deletions src/foqa/dataset.py
@@ -22,9 +22,14 @@ def build_dataset(config: DictConfig) -> None:
             The Hydra configuration object.
     """
     logger.info("Loading the Faroese Wikipedia dataset...")
-    dataset = load_dataset(
-        "alexandrainst/scandi-wiki", "fo", split="train", trust_remote_code=True
-    ).shuffle(seed=config.seed)
+    dataset = (
+        load_dataset(
+            "alexandrainst/scandi-wiki", "fo", split="train", trust_remote_code=True
+        )
+        .shuffle(seed=config.seed)
+        .filter(lambda x: len(x["text"]) > config.min_article_length)
+        .select(range(config.num_samples))
+    )
 
     records_path = Path(config.dirs.data) / config.dirs.raw / "records.jsonl"
     if records_path.exists():
@@ -33,40 +38,40 @@ def build_dataset(config: DictConfig) -> None:
     else:
         records = list()
 
-    for sample in tqdm(dataset, desc=f"Generating samples with {config.model}"):
-        sample_exists = any(record["id"] == sample["url"] for record in records)
-        if sample_exists:
-            continue
-
-        sample_is_too_short = len(sample["text"]) < config.min_article_length
-        if sample_is_too_short:
-            continue
-
-        try:
-            generated_samples = get_json_generation(
-                article=sample["text"], config=config
-            )
-        except Exception as e:
-            logger.info(
-                f"Failed to generate samples for {sample['url']} with error message "
-                f"{str(e)}. Skipping."
-            )
-            continue
-
-        with records_path.open("a") as f:
-            for generated_sample in generated_samples:
-                record = dict(
-                    id=sample["url"],
-                    title=sample["title"],
-                    context=sample["text"],
-                    question=generated_sample["question"],
-                    answers=dict(
-                        text=[generated_sample["answer"]],
-                        answer_start=sample["text"].find(generated_sample["answer"]),
-                    ),
-                )
-                records.append(record)
-                f.write(json.dumps(record) + "\n")
+    with tqdm(dataset, desc=f"Generating samples with {config.model}") as pbar:
+        for sample in pbar:
+            sample_exists = any(record["id"] == sample["url"] for record in records)
+            if sample_exists:
+                continue
+
+            try:
+                generated_samples = get_json_generation(
+                    article=sample["text"], config=config
+                )
+            except Exception as e:
+                logger.info(
+                    f"Failed to generate samples for {sample['url']} with error "
+                    f"{type(e)}: {e}. Skipping."
+                )
+                continue
+
+            with records_path.open("a") as f:
+                for generated_sample in generated_samples:
+                    record = dict(
+                        id=sample["url"],
+                        title=sample["title"],
+                        context=sample["text"],
+                        question=generated_sample["question"],
+                        answers=dict(
+                            text=[generated_sample["answer"]],
+                            answer_start=sample["text"].find(
+                                generated_sample["answer"]
+                            ),
+                        ),
+                    )
+                    records.append(record)
+                    f.write(json.dumps(record) + "\n")
+            pbar.set_postfix(samples_generated=len(records))
 
     logger.info("Converting the records to a Hugging Face dataset...")
     df = pd.DataFrame.from_records(records)
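The records written above follow a SQuAD-style layout (`id`, `title`, `context`, `question`, and an `answers` dict with `text` and `answer_start`). Below is a small sanity-check sketch for the resulting `records.jsonl`; the path assumes the `data`/`raw` directory layout implied by `config.dirs` and is otherwise illustrative.

```python
import json
from pathlib import Path

# Assumed location, following config.dirs.data / config.dirs.raw above.
records_path = Path("data") / "raw" / "records.jsonl"

with records_path.open() as f:
    records = [json.loads(line) for line in f if line.strip()]

for record in records:
    answer = record["answers"]["text"][0]
    start = record["answers"]["answer_start"]
    # answer_start comes from str.find, so -1 means the answer string was
    # not found verbatim in the context.
    if start != -1:
        assert record["context"][start : start + len(answer)] == answer
```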
26 changes: 26 additions & 0 deletions src/foqa/openai.py
@@ -70,4 +70,30 @@ def get_json_generation(article: str, config: DictConfig) -> list[dict[str, str]
         ):
             generated_samples.append(generated_sample)
 
+    # Re-phrase the generated questions
+    for generated_sample in generated_samples:
+        model_output = client.chat.completions.create(
+            messages=[
+                ChatCompletionSystemMessageParam(
+                    role="system", content=config.system_prompt
+                ),
+                ChatCompletionUserMessageParam(
+                    role="user",
+                    content=config.follow_up_prompt.format(
+                        question=generated_sample["question"]
+                    ),
+                ),
+            ],
+            model=config.model,
+            max_tokens=config.max_tokens,
+            temperature=config.temperature,
+            response_format=ResponseFormat(type="json_object"),
+        )
+        generation_output = model_output.choices[0].message.content
+        assert isinstance(generation_output, str)
+        json_obj = json.loads(generation_output)
+        assert isinstance(json_obj, dict) and "question" in json_obj
+        generated_sample["original_question"] = generated_sample["question"]
+        generated_sample["question"] = json_obj["question"]
+
     return generated_samples
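
With this change, `get_json_generation` makes one extra API call per generated question: the first call produces the question/answer pairs, and a follow-up call rewrites each question via `follow_up_prompt`, keeping the original under `original_question`. A rough usage sketch follows, assuming the module is importable as `foqa.openai`, an OpenAI API key is available to the module-level client, and the Hydra config lives at `config/config.yaml`.

```python
from omegaconf import OmegaConf

from foqa.openai import get_json_generation  # assumed import path

config = OmegaConf.load("config/config.yaml")
article = "Tórshavn er høvuðsstaðurin í Føroyum."  # invented one-line "article"

samples = get_json_generation(article=article, config=config)
for sample in samples:
    # Each sample now carries the rephrased question, the original question,
    # and the verbatim answer span.
    print(sample["question"], "|", sample["original_question"], "->", sample["answer"])
```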
