import os
import luigi
import zipfile


class ExtractRawData(luigi.Task):
    """Unpack the downloaded competition archive into the output directory.

    The task reads the zip file at ``data_path`` and extracts every member
    into ``OUTPUT_DIR``. It declares two targets, ``test.tsv.zip`` and
    ``train.tsv.zip``, inside that directory.

    All paths are relative, so they resolve against the current working
    directory of the process that runs the task.
    """

    data_path = luigi.Parameter(default="../data/sentiment-analysis-on-movie-reviews.zip")

    # Single source of truth for the extraction directory, so that output()
    # and run() can never disagree about where the files live.
    OUTPUT_DIR = "../data/output/"

    def output(self):
        """Return the targets this task is expected to produce.

        Returns:
            A dict with the keys ``"test"`` and ``"train"``, each mapping to
            a ``luigi.LocalTarget`` inside ``OUTPUT_DIR``.
        """
        return {
            "test": luigi.LocalTarget(os.path.join(self.OUTPUT_DIR, "test.tsv.zip")),
            "train": luigi.LocalTarget(os.path.join(self.OUTPUT_DIR, "train.tsv.zip")),
        }

    def run(self):
        """Extract the archive at ``data_path`` into ``OUTPUT_DIR``.

        Raises:
            FileNotFoundError: If ``data_path`` does not exist.
            RuntimeError: If, after extraction, any declared target is
                still missing.
        """
        # Explicit raise instead of `assert`: assertions are removed when
        # Python runs with -O, which would silently skip this check.
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"Raw data archive not found: {self.data_path}")

        with zipfile.ZipFile(self.data_path, "r") as archive:
            archive.extractall(self.OUTPUT_DIR)

        # Fail here, with a clear message, if the archive did not contain
        # the files that output() promises.
        missing = [
            name for name, target in self.output().items() if not target.exists()
        ]
        if missing:
            raise RuntimeError(
                f"Archive {self.data_path} did not produce expected output(s): "
                + ", ".join(sorted(missing))
            )