diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..84c96cc --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,35 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: +# release: +# types: [created] + push: + branches: [ master ] + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + pip install -r requirements.txt + - name: Build + run: | + python setup.py sdist bdist_wheel + - name: Archive Distribution Files + uses: actions/upload-artifact@v1 + with: + name: wheel + path: ./dist/question_generation-**.whl diff --git a/README.md b/README.md index b07e261..f1ddb7f 100644 --- a/README.md +++ b/README.md @@ -131,9 +131,7 @@ The [nlg-eval](https://github.com/Maluuba/nlg-eval) package is used for calculat ## Requirements ``` -transformers==3.0.0 -nltk -nlp==0.2.0 # only if you want to fine-tune. +python -m pip install git+https://github.com/patil-suraj/question_generation.git ``` after installing `nltk` do @@ -154,7 +152,7 @@ The pipeline is divided into 3 tasks #### Question Generation ```python3 -from pipelines import pipeline +from question_generation import pipeline nlp = pipeline("question-generation") nlp("42 is the answer to life, the universe and everything.") @@ -224,7 +222,7 @@ The datasets will be saved in `data/` directory. 
You should provide filenames us **process data for single task question generation with highlight_qg_format** ```bash -python prepare_data.py \ +python -m question_generation.prepare_data \ --task qg \ --model_type t5 \ --dataset_path data/squad_multitask/ \ @@ -240,7 +238,7 @@ python prepare_data.py \ `valid_for_qg_only` argument is used to decide if the validation set should only contain data for qg task. For my multi-task experiments I used validation data with only qg task so that the eval loss curve can be easly compared with other single task models ```bash -python prepare_data.py \ +python -m question_generation.prepare_data \ --task multi \ --valid_for_qg_only \ --model_type t5 \ @@ -254,7 +252,7 @@ python prepare_data.py \ **process dataset for end-to-end question generation** ```bash -python prepare_data.py \ +python -m question_generation.prepare_data \ --task e2e_qg \ --valid_for_qg_only \ --model_type t5 \ @@ -271,7 +269,7 @@ Use the `run_qg.py` script to start training. It uses transformers `Trainer` cl ```bash -python run_qg.py \ +python -m question_generation.run_qg \ --model_name_or_path t5-small \ --model_type t5 \ --tokenizer_name_or_path t5_qg_tokenizer \ @@ -293,7 +291,7 @@ python run_qg.py \ or if you want to train it from script or notebook then ```python3 -from run_qg import run_qg +from question_generation import run_qg args_dict = { "model_name_or_path": "t5-small", @@ -323,7 +321,7 @@ run_qg(args_dict) Use the `eval.py` script for evaluting the model. 
```bash -python eval.py \ +python -m question_generation.eval \ --model_name_or_path t5-base-qg-hl \ --valid_file_path valid_data_qg_hl_t5.pt \ --model_type t5 \ diff --git a/question_generation/__init__.py b/question_generation/__init__.py new file mode 100644 index 0000000..e129ec7 --- /dev/null +++ b/question_generation/__init__.py @@ -0,0 +1,4 @@ +from .pipelines import pipeline +from .run_qg import run_qg + +__version__ = "0.1.0" diff --git a/data_collator.py b/question_generation/data_collator.py similarity index 100% rename from data_collator.py rename to question_generation/data_collator.py diff --git a/eval.py b/question_generation/eval.py similarity index 97% rename from eval.py rename to question_generation/eval.py index 4b59c6e..bce408a 100644 --- a/eval.py +++ b/question_generation/eval.py @@ -6,7 +6,7 @@ from tqdm.auto import tqdm from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, HfArgumentParser -from data_collator import T2TDataCollator +from question_generation.data_collator import T2TDataCollator device = 'cuda' if torch.cuda.is_available else 'cpu' diff --git a/pipelines.py b/question_generation/pipelines.py similarity index 100% rename from pipelines.py rename to question_generation/pipelines.py diff --git a/prepare_data.py b/question_generation/prepare_data.py similarity index 100% rename from prepare_data.py rename to question_generation/prepare_data.py diff --git a/run_qg.py b/question_generation/run_qg.py similarity index 96% rename from run_qg.py rename to question_generation/run_qg.py index 20b8abe..f52da4b 100644 --- a/run_qg.py +++ b/question_generation/run_qg.py @@ -1,28 +1,24 @@ -import dataclasses import json import logging import os import sys from dataclasses import dataclass, field -from typing import Dict, List, Optional +from typing import Optional -import numpy as np import torch from transformers import ( AutoModelForSeq2SeqLM, - AutoTokenizer, T5Tokenizer, BartTokenizer, HfArgumentParser, - DataCollator, 
TrainingArguments, set_seed, ) -from trainer import Trainer -from data_collator import T2TDataCollator -from utils import freeze_embeds, assert_not_all_frozen +from question_generation.trainer import Trainer +from question_generation.data_collator import T2TDataCollator +from question_generation.utils import freeze_embeds, assert_not_all_frozen MODEL_TYPE_TO_TOKENIZER = { "t5": T5Tokenizer, diff --git a/trainer.py b/question_generation/trainer.py similarity index 96% rename from trainer.py rename to question_generation/trainer.py index 29612d8..67711a6 100644 --- a/trainer.py +++ b/question_generation/trainer.py @@ -9,7 +9,7 @@ if is_apex_available(): from apex import amp -from utils import label_smoothed_nll_loss +from question_generation.utils import label_smoothed_nll_loss class Trainer(HFTrainer): def __init__(self, label_smoothing: float = 0, **kwargs): diff --git a/utils.py b/question_generation/utils.py similarity index 100% rename from utils.py rename to question_generation/utils.py diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5ccc3fe --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +transformers>=3.0.0 +nltk +nlp>=0.2.0 +torch diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..a253057 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,8 @@ +[metadata] +license = MIT +license-file = LICENSE +description-file = README.md +platform = any + +[bdist_wheel] +universal = 1 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..fd619a6 --- /dev/null +++ b/setup.py @@ -0,0 +1,33 @@ +from setuptools import setup, find_packages + +from question_generation import __version__ + +with open("README.md", "r") as f: + long_description = f.read() + +setup( + name="question_generation", + packages=find_packages(), + version=__version__, + url="https://github.com/patil-suraj/question_generation", + license="MIT", + author="Suraj Patil", + author_email="surajp815@gmail.com", + 
description="Question generation is the task of automatically generating questions from a text paragraph.", + install_requires=["transformers>=3.0.0", "nltk", "nlp>=0.2.0", "torch"], + python_requires=">=3.6", + include_package_data=True, + platforms="any", + long_description=long_description, + long_description_content_type="text/markdown", + classifiers=[ + "Operating System :: OS Independent", + "License :: OSI Approved :: MIT License", + "Topic :: Utilities", + "Intended Audience :: Developers", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + ], +)