From 31321a067a2fad4b926d6c12d6d6c4b72249642b Mon Sep 17 00:00:00 2001 From: leomaurodesenv Date: Mon, 6 Nov 2023 19:47:22 -0300 Subject: [PATCH] feat: refactoring raw data --- README.md | 3 ++- requirements.txt | 1 + source/data.py | 32 ++++++++++++++++++++++++++++++++ source/get_raw_data.py | 27 --------------------------- source/preprocessing.py | 1 + 5 files changed, 36 insertions(+), 28 deletions(-) create mode 100644 source/data.py delete mode 100644 source/get_raw_data.py create mode 100644 source/preprocessing.py diff --git a/README.md b/README.md index 0d77e61..3adf309 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ This is a learning repository about DVC Data Version Control and Luigi Pipelines - luigi, dvc, pre-commit -- setup https://pre-commit.com/ +- https://luigi.readthedocs.io/ +- setup https://pre-commit.com/, https://pre-commit.com/hooks.html - setup https://github.com/Kaggle/kaggle-api - `kaggle competitions download -c sentiment-analysis-on-movie-reviews -p data` diff --git a/requirements.txt b/requirements.txt index c0d9d92..c1c7b9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ pre-commit==3.5.0 kaggle==1.5.16 dvc==3.28.0 luigi==3.4.0 +scikit-learn==1.3.2 diff --git a/source/data.py b/source/data.py new file mode 100644 index 0000000..1a97140 --- /dev/null +++ b/source/data.py @@ -0,0 +1,32 @@ +import luigi +import zipfile + + +class SentimentAnalysisZipFile(luigi.ExternalTask): + """ + Raw data zip file + """ + + def output(self): + return luigi.LocalTarget("../data/sentiment-analysis-on-movie-reviews.zip") + + +class ExtractRawData(luigi.Task): + """ + Extract raw data from zip file + """ + + def requires(self): + return SentimentAnalysisZipFile() + + def output(self): + return { + "test": luigi.LocalTarget("../data/output/test.tsv.zip"), + "train": luigi.LocalTarget("../data/output/train.tsv.zip"), + "sampleSubission": luigi.LocalTarget("../data/output/sampleSubmission.csv"), + } + + def run(self): + # Unzip data file + with zipfile.ZipFile(self.input().path, "r") as zip_ref: + zip_ref.extractall("../data/output/") diff --git a/source/get_raw_data.py b/source/get_raw_data.py deleted file mode 100644 index b721833..0000000 --- a/source/get_raw_data.py +++ /dev/null @@ -1,27 +0,0 @@ -import os -import luigi -import zipfile - - -class ExtractRawData(luigi.Task): - """ - Extract raw data from zip file - """ - - data_path = luigi.Parameter( - default="../data/sentiment-analysis-on-movie-reviews.zip" - ) - - def output(self): - return { - "test": luigi.LocalTarget("../data/output/test.tsv.zip"), - "train": luigi.LocalTarget("../data/output/train.tsv.zip"), - } - - def run(self): - # Check if data file exists - assert os.path.exists(self.data_path) - - # Unzip data file - with zipfile.ZipFile(self.data_path, "r") as zip_ref: - zip_ref.extractall("../data/output/") diff --git a/source/preprocessing.py b/source/preprocessing.py new file mode 100644 index 0000000..63ede75 --- /dev/null +++ b/source/preprocessing.py @@ -0,0 +1 @@ +import luigi