Skip to content

Commit

Permalink
feat: refactoring raw data
Browse files Browse the repository at this point in the history
  • Loading branch information
leomaurodesenv committed Nov 6, 2023
1 parent 38daa5f commit 31321a0
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 28 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This is a learning repository about DVC (Data Version Control) and Luigi pipelines.

- luigi, dvc, pre-commit
- setup https://pre-commit.com/
- https://luigi.readthedocs.io/
- setup https://pre-commit.com/, https://pre-commit.com/hooks.html
- setup https://github.com/Kaggle/kaggle-api
- `kaggle competitions download -c sentiment-analysis-on-movie-reviews -p data`
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ pre-commit==3.5.0
kaggle==1.5.16
dvc==3.28.0
luigi==3.4.0
scikit-learn==1.3.2
32 changes: 32 additions & 0 deletions source/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import luigi
import zipfile


class SentimentAnalysisZipFile(luigi.ExternalTask):
    """
    Raw competition archive, declared as an external dependency.

    This task defines no ``run`` step; it only points at the zip file
    location so that other tasks can require it.
    """

    def output(self):
        """Return the local target for the raw zip archive."""
        # Relative path: resolved against the current working directory.
        archive_path = "../data/sentiment-analysis-on-movie-reviews.zip"
        return luigi.LocalTarget(archive_path)


class ExtractRawData(luigi.Task):
    """
    Unpack the raw data archive into ``../data/output/``.

    Requires the zip file declared by ``SentimentAnalysisZipFile`` and
    exposes three of the files expected in the output directory as targets.
    """

    def requires(self):
        """Return the task that provides the raw zip archive."""
        archive_task = SentimentAnalysisZipFile()
        return archive_task

    def output(self):
        """Return a dict mapping a short name to each expected file target."""
        # NOTE(review): the key "sampleSubission" looks like a typo for
        # "sampleSubmission"; it is kept unchanged because downstream tasks
        # may index this dict by the current spelling -- confirm and rename.
        expected_files = (
            ("test", "../data/output/test.tsv.zip"),
            ("train", "../data/output/train.tsv.zip"),
            ("sampleSubission", "../data/output/sampleSubmission.csv"),
        )
        return {
            label: luigi.LocalTarget(file_path)
            for label, file_path in expected_files
        }

    def run(self):
        """Extract every member of the required archive into the output dir."""
        # self.input() is the single target returned by the required task.
        archive_path = self.input().path
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall("../data/output/")
27 changes: 0 additions & 27 deletions source/get_raw_data.py

This file was deleted.

1 change: 1 addition & 0 deletions source/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
import luigi

0 comments on commit 31321a0

Please sign in to comment.