From 39f5b2915e1a61f0c696655249e46d4c94f834f2 Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Thu, 17 Sep 2020 16:24:52 +0530
Subject: [PATCH 01/11] done changes to get the library install from pip using git url

---
 README.md | 16 +++++++---------
 question_generation/__init__.py | 1 +
 .../data_collator.py | 0
 eval.py => question_generation/eval.py | 2 +-
 pipelines.py => question_generation/pipelines.py | 0
 .../prepare_data.py | 0
 run_qg.py => question_generation/run_qg.py | 12 ++++--------
 trainer.py => question_generation/trainer.py | 2 +-
 utils.py => question_generation/utils.py | 0
 9 files changed, 14 insertions(+), 19 deletions(-)
 create mode 100644 question_generation/__init__.py
 rename data_collator.py => question_generation/data_collator.py (100%)
 rename eval.py => question_generation/eval.py (97%)
 rename pipelines.py => question_generation/pipelines.py (100%)
 rename prepare_data.py => question_generation/prepare_data.py (100%)
 rename run_qg.py => question_generation/run_qg.py (96%)
 rename trainer.py => question_generation/trainer.py (96%)
 rename utils.py => question_generation/utils.py (100%)

diff --git a/README.md b/README.md
index 9b90eb7..c3b909a 100644
--- a/README.md
+++ b/README.md
@@ -131,9 +131,7 @@ The [nlg-eval](https://github.com/Maluuba/nlg-eval) package is used for calculat
 
 ## Requirements
 ```
-transformers==3.0.0
-nltk
-nlp==0.2.0 # only if you want to fine-tune.
+python -m pip install https://github.com/patil-suraj/question_generation.git
 ```
 
 after installing `nltk` do
@@ -154,7 +152,7 @@ The pipeline is divided into 3 tasks
 #### Question Generation
 
 ```python3
-from pipelines import pipeline
+from question_generation import pipeline
 
 nlp = pipeline("question-generation")
 nlp("42 is the answer to life, the universe and everything.")
@@ -224,7 +222,7 @@ The datasets will be saved in `data/` directory. You should provide filenames us
 **process data for single task question generation with highlight_qg_format**
 
 ```bash
-python prepare_data.py \
+python question_generation.prepare_data.py \
     --task qg \
     --model_type t5 \
     --dataset_path data/squad_multitask/ \
@@ -240,7 +238,7 @@
 `valid_for_qg_only` argument is used to decide if the validation set should only contain data for qg task. For my multi-task experiments I used validation data with only qg task so that the eval loss curve can be easly compared with other single task models
 
 ```bash
-python prepare_data.py \
+python question_generation.prepare_data.py \
     --task multi \
     --valid_for_qg_only \
     --model_type t5 \
@@ -254,7 +252,7 @@
 **process dataset for end-to-end question generation**
 
 ```bash
-python prepare_data.py \
+python question_generation.prepare_data.py \
     --task e2e_qg \
     --valid_for_qg_only \
     --model_type t5 \
@@ -271,7 +269,7 @@ Use the `run_qg.py` script to start training. It uses transformers `Trainer` cl
 
 
 ```bash
-python run_qg.py \
+python question_generation.run_qg.py \
     --model_name_or_path t5-small \
     --model_type t5 \
     --tokenizer_name_or_path t5_qg_tokenizer \
@@ -323,7 +321,7 @@ run_qg(args_dict)
 Use the `eval.py` script for evaluting the model.
 
 ```bash
-python eval.py \
+python question_generation.eval.py \
     --model_name_or_path t5-base-qg-hl \
     --valid_file_path valid_data_qg_hl_t5.pt \
     --model_type t5 \
diff --git a/question_generation/__init__.py b/question_generation/__init__.py
new file mode 100644
index 0000000..d6c0bd4
--- /dev/null
+++ b/question_generation/__init__.py
@@ -0,0 +1 @@
+from .pipelines import pipeline
\ No newline at end of file
diff --git a/data_collator.py b/question_generation/data_collator.py
similarity index 100%
rename from data_collator.py
rename to question_generation/data_collator.py
diff --git a/eval.py b/question_generation/eval.py
similarity index 97%
rename from eval.py
rename to question_generation/eval.py
index 4b59c6e..bce408a 100644
--- a/eval.py
+++ b/question_generation/eval.py
@@ -6,7 +6,7 @@
 from tqdm.auto import tqdm
 
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, HfArgumentParser
 
-from data_collator import T2TDataCollator
+from question_generation.data_collator import T2TDataCollator
 
 device = 'cuda' if torch.cuda.is_available else 'cpu'
diff --git a/pipelines.py b/question_generation/pipelines.py
similarity index 100%
rename from pipelines.py
rename to question_generation/pipelines.py
diff --git a/prepare_data.py b/question_generation/prepare_data.py
similarity index 100%
rename from prepare_data.py
rename to question_generation/prepare_data.py
diff --git a/run_qg.py b/question_generation/run_qg.py
similarity index 96%
rename from run_qg.py
rename to question_generation/run_qg.py
index 20b8abe..f52da4b 100644
--- a/run_qg.py
+++ b/question_generation/run_qg.py
@@ -1,28 +1,24 @@
-import dataclasses
 import json
 import logging
 import os
 import sys
 
 from dataclasses import dataclass, field
-from typing import Dict, List, Optional
+from typing import Optional
 
-import numpy as np
 import torch
 from transformers import (
     AutoModelForSeq2SeqLM,
-    AutoTokenizer,
     T5Tokenizer,
     BartTokenizer,
     HfArgumentParser,
-    DataCollator,
     TrainingArguments,
     set_seed,
 )
 
-from trainer import Trainer
-from data_collator import T2TDataCollator
-from utils import freeze_embeds, assert_not_all_frozen
+from question_generation.trainer import Trainer
+from question_generation.data_collator import T2TDataCollator
+from question_generation.utils import freeze_embeds, assert_not_all_frozen
 
 MODEL_TYPE_TO_TOKENIZER = {
     "t5": T5Tokenizer,
diff --git a/trainer.py b/question_generation/trainer.py
similarity index 96%
rename from trainer.py
rename to question_generation/trainer.py
index 29612d8..67711a6 100644
--- a/trainer.py
+++ b/question_generation/trainer.py
@@ -9,7 +9,7 @@
 if is_apex_available():
     from apex import amp
 
-from utils import label_smoothed_nll_loss
+from question_generation.utils import label_smoothed_nll_loss
 
 class Trainer(HFTrainer):
     def __init__(self, label_smoothing: float = 0, **kwargs):
diff --git a/utils.py b/question_generation/utils.py
similarity index 100%
rename from utils.py
rename to question_generation/utils.py
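
The net effect of this first patch is that the former top-level modules now live inside a `question_generation` package, with `pipeline` re-exported from `question_generation/__init__.py`. A minimal usage sketch of the new import path (the task name and example sentence are the ones from the README diff above; it assumes the package is installed and the default models the pipeline downloads are reachable):

```python3
# Minimal sketch of the import path introduced by patch 01: `pipeline` is
# re-exported from question_generation/__init__.py, so callers no longer need
# to import the top-level pipelines module directly.
from question_generation import pipeline

nlp = pipeline("question-generation")  # single-task QG pipeline from the README
print(nlp("42 is the answer to life, the universe and everything."))
```
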
From 54b79c935e89226bef4afd9b3a6ff498b137050f Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Thu, 17 Sep 2020 16:34:40 +0530
Subject: [PATCH 02/11] added setup files

---
 README.md | 2 +-
 requirements.txt | 3 +++
 setup.cfg | 8 ++++++++
 setup.py | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 requirements.txt
 create mode 100644 setup.cfg
 create mode 100644 setup.py

diff --git a/README.md b/README.md
index c3b909a..fe49857 100644
--- a/README.md
+++ b/README.md
@@ -131,7 +131,7 @@ The [nlg-eval](https://github.com/Maluuba/nlg-eval) package is used for calculat
 
 ## Requirements
 ```
-python -m pip install https://github.com/patil-suraj/question_generation.git
+python -m pip install git+https://github.com/patil-suraj/question_generation.git
 ```
 
 after installing `nltk` do
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ba0da64
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+transformers>=3.0.0
+nltk
+nlp>=0.2.0
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..a253057
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,8 @@
+[metadata]
+license = MIT
+license-file = LICENSE
+description-file = README.md
+platform = any
+
+[bdist_wheel]
+universal = 1
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a43d184
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,33 @@
+from setuptools import setup, find_packages
+
+from smart_config import __version__
+
+with open("README.md", "r") as f:
+    long_description = f.read()
+
+setup(
+    name="question_generation",
+    packages=find_packages(),
+    version=__version__,
+    url="https://github.com/patil-suraj/question_generation",
+    license="MIT",
+    author="Suraj Patil",
+    author_email="surajp815@gmail.com",
+    description="Question generation is the task of automatically generating questions from a text paragraph.",
+    install_requires=["transformers>=3.0.0", "nltk", "nlp>=0.2.0"],
+    python_requires=">=3.6",
+    include_package_data=True,
+    platforms="any",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    classifiers=[
+        "Operating System :: OS Independent",
+        "License :: OSI Approved :: MIT License",
+        "Topic :: Utilities",
+        "Intended Audience :: Developers",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+    ],
+)
\ No newline at end of file

From af63982d0bfab7e07a0b7459557e3c36428d18eb Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Thu, 17 Sep 2020 16:47:50 +0530
Subject: [PATCH 03/11] done document fixes and minor changes

---
 README.md | 12 ++++++------
 question_generation/__init__.py | 5 ++++-
 setup.py | 2 +-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index fe49857..515801e 100644
--- a/README.md
+++ b/README.md
@@ -222,7 +222,7 @@ The datasets will be saved in `data/` directory. You should provide filenames us
 **process data for single task question generation with highlight_qg_format**
 
 ```bash
-python question_generation.prepare_data.py \
+python -m question_generation.prepare_data.py \
     --task qg \
     --model_type t5 \
     --dataset_path data/squad_multitask/ \
@@ -238,7 +238,7 @@
 `valid_for_qg_only` argument is used to decide if the validation set should only contain data for qg task. For my multi-task experiments I used validation data with only qg task so that the eval loss curve can be easly compared with other single task models
 
 ```bash
-python question_generation.prepare_data.py \
+python -m question_generation.prepare_data.py \
     --task multi \
     --valid_for_qg_only \
     --model_type t5 \
@@ -252,7 +252,7 @@
 **process dataset for end-to-end question generation**
 
 ```bash
-python question_generation.prepare_data.py \
+python -m question_generation.prepare_data.py \
     --task e2e_qg \
     --valid_for_qg_only \
     --model_type t5 \
@@ -269,7 +269,7 @@ Use the `run_qg.py` script to start training. It uses transformers `Trainer` cl
 
 
 ```bash
-python question_generation.run_qg.py \
+python -m question_generation.run_qg.py \
     --model_name_or_path t5-small \
     --model_type t5 \
     --tokenizer_name_or_path t5_qg_tokenizer \
@@ -291,7 +291,7 @@
 or if you want to train it from script or notebook then
 
 ```python3
-from run_qg import run_qg
+from question_generation import run_qg
 
 args_dict = {
     "model_name_or_path": "t5-small",
@@ -321,7 +321,7 @@ run_qg(args_dict)
 Use the `eval.py` script for evaluting the model.
 
 ```bash
-python question_generation.eval.py \
+python -m question_generation.eval.py \
     --model_name_or_path t5-base-qg-hl \
     --valid_file_path valid_data_qg_hl_t5.pt \
     --model_type t5 \
diff --git a/question_generation/__init__.py b/question_generation/__init__.py
index d6c0bd4..e129ec7 100644
--- a/question_generation/__init__.py
+++ b/question_generation/__init__.py
@@ -1 +1,4 @@
-from .pipelines import pipeline
\ No newline at end of file
+from .pipelines import pipeline
+from .run_qg import run_qg
+
+__version__ = "0.1.0"
diff --git a/setup.py b/setup.py
index a43d184..e76b20d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 
-from smart_config import __version__
+from question_generation import __version__
 
 with open("README.md", "r") as f:
     long_description = f.read()
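
After patch 03 the package's public surface is `pipeline`, `run_qg`, and `__version__`, all defined in `question_generation/__init__.py`, and `setup.py` now reads that same `__version__` so the version string lives in one place. A small sketch of what that surface looks like to a caller (the names and the 0.1.0 value are taken from the diff above; nothing else is assumed):

```python3
# Sketch of the package surface after patch 03; every name used here is
# defined or re-exported in question_generation/__init__.py above.
import question_generation

print(question_generation.__version__)   # "0.1.0" in this patch series

qg = question_generation.pipeline("question-generation")  # re-exported pipeline factory
# question_generation.run_qg(args_dict) is the programmatic training entry
# point shown in the README example above.
```

One consequence of single-sourcing the version this way is that `setup.py` imports the package itself, so the package's runtime dependencies have to be importable at the time `setup.py` runs.
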
From ae0bbb0aeb7fe7f63615c0109eee23894fe5079c Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Mon, 21 Sep 2020 15:20:19 +0530
Subject: [PATCH 04/11] Update requirements.txt

added missing dependency
---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ba0da64..5ccc3fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 transformers>=3.0.0
 nltk
-nlp>=0.2.0
\ No newline at end of file
+nlp>=0.2.0
+torch

From 3575c5a8620f54a6cff592da48757ac1d983a341 Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Mon, 21 Sep 2020 15:21:00 +0530
Subject: [PATCH 05/11] Update setup.py

added missing dependency
---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index e76b20d..fd619a6 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
     author="Suraj Patil",
     author_email="surajp815@gmail.com",
     description="Question generation is the task of automatically generating questions from a text paragraph.",
-    install_requires=["transformers>=3.0.0", "nltk", "nlp>=0.2.0"],
+    install_requires=["transformers>=3.0.0", "nltk", "nlp>=0.2.0", "torch"],
     python_requires=">=3.6",
     include_package_data=True,
     platforms="any",
@@ -30,4 +30,4 @@
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
     ],
-)
\ No newline at end of file
+)
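
Patches 04 and 05 bring `requirements.txt` and `install_requires` into agreement by adding the missing `torch` dependency. A quick, optional smoke test for the declared runtime dependencies (the module names are exactly the four listed in the diffs; the version attribute is read only if a module exposes one):

```python3
# Hedged sketch: check that the dependencies declared in requirements.txt and
# setup.py after patches 04-05 can actually be imported.
import importlib

for module_name in ("transformers", "nltk", "nlp", "torch"):
    module = importlib.import_module(module_name)
    print(module_name, getattr(module, "__version__", "unknown"))
```
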
From 56f4963f20b19964cf6f496072a5eb35db0c3af6 Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Mon, 21 Sep 2020 15:21:19 +0530
Subject: [PATCH 06/11] Create python-publish.yml

---
 .github/workflows/python-publish.yml | 33 ++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 .github/workflows/python-publish.yml

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 0000000..199343c
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,33 @@
+# This workflows will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine
+        pip install -r requirements.txt
+    - name: Build
+      run: |
+        python setup.py sdist bdist_wheel
+    - name: Archive pytest results
+      uses: actions/upload-artifact@v1
+      with:
+        name: distribution
+        path: dist/*

From f104cd2ef7e2d0a52125ef95a4363d005b537dbe Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Mon, 21 Sep 2020 15:33:37 +0530
Subject: [PATCH 07/11] achive stage name changed

---
 .github/workflows/python-publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 199343c..35f73a2 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -26,7 +26,7 @@ jobs:
     - name: Build
       run: |
         python setup.py sdist bdist_wheel
-    - name: Archive pytest results
+    - name: Archive Distribution Files
      uses: actions/upload-artifact@v1
       with:
         name: distribution

From e710128634b34b9850f9be22004aee8bec9fadee Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Mon, 21 Sep 2020 16:19:28 +0530
Subject: [PATCH 08/11] done changes in file path

---
 .github/workflows/python-publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 35f73a2..ac2d547 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -30,4 +30,4 @@ jobs:
       uses: actions/upload-artifact@v1
       with:
         name: distribution
-        path: dist/*
+        path: /home/runner/work/question_generation/dist

From fbe7a77f82001fcbcf09a15f75008670c39e8c4a Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Mon, 21 Sep 2020 16:24:05 +0530
Subject: [PATCH 09/11] done changes in flow

---
 .github/workflows/python-publish.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index ac2d547..67d5d01 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -4,8 +4,10 @@
 name: Upload Python Package
 
 on:
-  release:
-    types: [created]
+# release:
+#   types: [created]
+  push:
+    branches: [ master ]
 
 jobs:
   deploy:

From 0e5c2872e3848a3b4cc37b6237fd112de211f808 Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Mon, 21 Sep 2020 16:29:42 +0530
Subject: [PATCH 10/11] done changes in dist path

---
 .github/workflows/python-publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 67d5d01..dfd6532 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -32,4 +32,4 @@ jobs:
       uses: actions/upload-artifact@v1
       with:
         name: distribution
-        path: /home/runner/work/question_generation/dist
+        path: ./dist
From bb99426936ee88174c7f2f3959c9832c6732c17c Mon Sep 17 00:00:00 2001
From: Fahad Ali Shaikh
Date: Mon, 21 Sep 2020 17:43:23 +0530
Subject: [PATCH 11/11] updated file

---
 .github/workflows/python-publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index dfd6532..84c96cc 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -31,5 +31,5 @@ jobs:
     - name: Archive Distribution Files
       uses: actions/upload-artifact@v1
       with:
-        name: distribution
-        path: ./dist
+        name: wheel
+        path: ./dist/question_generation-**.whl
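
Taken together, patches 06-11 make the workflow build an sdist and a wheel on every push to master and upload only the wheel as a build artifact named `wheel`; no PyPI upload step is configured even though the workflow keeps the template's "Upload Python Package" name. A small sketch for checking an installed copy of that wheel against the metadata declared in `setup.py` (it assumes the wheel has already been installed, for example with pip, and a Python 3.8+ interpreter for `importlib.metadata`):

```python3
# Hedged sketch: confirm the installed distribution matches what setup.py and
# question_generation/__init__.py declare for this patch series.
from importlib.metadata import metadata, version  # stdlib since Python 3.8

print(version("question_generation"))     # expected "0.1.0" for this series
meta = metadata("question_generation")
print(meta["Summary"])                    # the description= string from setup.py
print(meta.get_all("Requires-Dist"))      # transformers, nltk, nlp, torch
```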