Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use `datasets` instead of `nlp`, and add a requirements.txt. #115

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions data/squad_multitask/squad_multitask.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import nltk
nltk.download('punkt')

import nlp
import datasets


_CITATION = """\
Expand Down Expand Up @@ -56,7 +56,7 @@
]


class SquadMultitaskConfig(nlp.BuilderConfig):
class SquadMultitaskConfig(datasets.BuilderConfig):
"""BuilderConfig for SQUAD."""

def __init__(self, qg_format="highlight", **kwargs):
Expand All @@ -69,7 +69,7 @@ def __init__(self, qg_format="highlight", **kwargs):
self.qg_format = qg_format


class SquadMultitask(nlp.GeneratorBasedBuilder):
class SquadMultitask(datasets.GeneratorBasedBuilder):
"""SQUAD: The Stanford Question Answering Dataset. Version 1.1."""

_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
Expand All @@ -79,21 +79,21 @@ class SquadMultitask(nlp.GeneratorBasedBuilder):
BUILDER_CONFIGS = [
SquadMultitaskConfig(
name=f"{format_}_qg_format",
version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
version=datasets.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
description="Plain text",
qg_format=format_
)
for format_ in QG_FORMATS
]

def _info(self):
return nlp.DatasetInfo(
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=nlp.Features(
features=datasets.Features(
{
"source_text": nlp.Value("string"),
"target_text": nlp.Value("string"),
"task": nlp.Value("string"),
"source_text": datasets.Value("string"),
"target_text": datasets.Value("string"),
"task": datasets.Value("string"),
}
),
# No default supervised_keys (as we have to pass both question
Expand All @@ -111,8 +111,8 @@ def _split_generators(self, dl_manager):
downloaded_files = dl_manager.download_and_extract(urls_to_download)

return [
nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}),
datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}),
]

def _get_correct_alignement(self, context, answer):
Expand Down
6 changes: 3 additions & 3 deletions prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Dict, List, Optional

import torch
import nlp
import datasets
from transformers import T5Tokenizer, BartTokenizer, HfArgumentParser


Expand Down Expand Up @@ -152,8 +152,8 @@ def main():

tokenizer.add_tokens(['<sep>', '<hl>'])

train_dataset = nlp.load_dataset(data_args.dataset_path, name=data_args.qg_format, split=nlp.Split.TRAIN)
valid_dataset = nlp.load_dataset(data_args.dataset_path, name=data_args.qg_format, split=nlp.Split.VALIDATION)
train_dataset = datasets.load_dataset(data_args.dataset_path, name=data_args.qg_format, split=datasets.Split.TRAIN)
valid_dataset = datasets.load_dataset(data_args.dataset_path, name=data_args.qg_format, split=datasets.Split.VALIDATION)

processor = DataProcessor(
tokenizer,
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
transformers>=3.0.0
nltk
torch
datasets>=2.12.0