Skip to content

Commit

Permalink
feat: add first model (#5)
Browse files Browse the repository at this point in the history
* feat: refactoring raw data
* feat: create first model
  • Loading branch information
leomaurodesenv authored Nov 7, 2023
1 parent 38daa5f commit a545d0d
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 28 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This is a learning repository about DVC (Data Version Control) and Luigi pipelines.

- luigi, dvc, pre-commit
- setup https://pre-commit.com/
- https://luigi.readthedocs.io/
- setup https://pre-commit.com/, https://pre-commit.com/hooks.html
- setup https://github.com/Kaggle/kaggle-api
- `kaggle competitions download -c sentiment-analysis-on-movie-reviews -p data`
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ pre-commit==3.5.0
kaggle==1.5.16
dvc==3.28.0
luigi==3.4.0
pandas==2.1.2
scikit-learn==1.3.2
32 changes: 32 additions & 0 deletions source/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import luigi
import zipfile


class SentimentAnalysisZipFile(luigi.ExternalTask):
    """
    Raw competition archive, declared as an external dependency.

    The task has no ``run`` step of its own; it only tells Luigi where the
    zip archive is expected to be on disk.
    """

    def output(self):
        """Return the local target for the raw zip archive."""
        archive_target = luigi.LocalTarget(
            "../data/sentiment-analysis-on-movie-reviews.zip"
        )
        return archive_target


class ExtractRawData(luigi.Task):
    """
    Unpack the raw zip archive into the output directory.
    """

    def requires(self):
        """Depend on the raw zip archive."""
        return SentimentAnalysisZipFile()

    def output(self):
        """Return the targets for the extracted files, keyed by dataset name."""
        targets = {}
        targets["test"] = luigi.LocalTarget("../data/output/test.tsv.zip")
        targets["train"] = luigi.LocalTarget("../data/output/train.tsv.zip")
        # NOTE: the key spelling "sampleSubission" is kept as-is because it is
        # part of the mapping downstream tasks receive from ``self.input()``.
        targets["sampleSubission"] = luigi.LocalTarget(
            "../data/output/sampleSubmission.csv"
        )
        return targets

    def run(self):
        """Extract every member of the archive into the output directory."""
        archive_path = self.input().path
        with zipfile.ZipFile(archive_path, mode="r") as archive:
            archive.extractall(path="../data/output/")
27 changes: 0 additions & 27 deletions source/get_raw_data.py

This file was deleted.

89 changes: 89 additions & 0 deletions source/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import luigi
import joblib
import pandas as pd
import scipy.sparse
from data import ExtractRawData
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier


class Preprocessing(luigi.Task):
    """
    NLP preprocessing: turn the training phrases into a bag-of-n-grams matrix.

    Produces two artifacts: the sparse feature matrix for the training set
    and the fitted vectorizer, so the same vocabulary can be applied to other
    data later.
    """

    def requires(self):
        """Depend on the extracted raw data files."""
        return ExtractRawData()

    def output(self):
        """Return the targets for the feature matrix and the fitted vectorizer."""
        return {
            "X": luigi.LocalTarget("../data/output/preprocessing.npz"),
            "vectorizer": luigi.LocalTarget("../data/output/preprocessing.joblib"),
        }

    def run(self):
        """Fit the vectorizer on the training phrases and persist the results."""
        dataset = self.input()
        outputs = self.output()
        train = pd.read_csv(dataset["train"].path, sep="\t")
        # Missing phrases are read as NaN, which CountVectorizer rejects.
        # Replace them with empty strings (the same handling Predict applies
        # to the test phrases) instead of dropping rows, so the matrix keeps
        # one row per training example and stays aligned with the labels.
        corpus = train["Phrase"].fillna("")
        # Lowercased unigrams and bigrams, capped at 10,000 features.
        vectorizer = CountVectorizer(
            lowercase=True, ngram_range=(1, 2), max_features=10_000
        )
        X = vectorizer.fit_transform(corpus)
        # Store the results.
        scipy.sparse.save_npz(outputs["X"].path, X)
        joblib.dump(vectorizer, outputs["vectorizer"].path)


class TrainModel(luigi.Task):
    """
    Fit the sentiment classifier on the preprocessed training features.
    """

    def requires(self):
        """Depend on the raw data (for labels) and the preprocessing artifacts."""
        dependencies = {}
        dependencies["data"] = ExtractRawData()
        dependencies["preprocessing"] = Preprocessing()
        return dependencies

    def output(self):
        """Return the target for the serialized model."""
        return luigi.LocalTarget("../data/output/model.joblib")

    def run(self):
        """Load features and labels, train the classifier, and save it."""
        sources = self.input()

        # Features come from the preprocessing step, labels from the raw file.
        features = scipy.sparse.load_npz(sources["preprocessing"]["X"].path)
        train_frame = pd.read_csv(sources["data"]["train"].path, sep="\t")
        labels = train_frame["Sentiment"]

        classifier = MLPClassifier(
            hidden_layer_sizes=(512, 256),
            max_iter=500,
            early_stopping=True,
            random_state=29,
            verbose=True,
        )
        classifier.fit(features, labels)

        model_path = self.output().path
        joblib.dump(classifier, model_path)


class Predict(luigi.Task):
    """
    Score the test phrases with the trained model and write a submission file.
    """

    def requires(self):
        """Depend on the raw data, the fitted vectorizer, and the trained model."""
        dependencies = {}
        dependencies["data"] = ExtractRawData()
        dependencies["preprocessing"] = Preprocessing()
        dependencies["model"] = TrainModel()
        return dependencies

    def output(self):
        """Return the target for the submission CSV."""
        return luigi.LocalTarget("../data/output/submission.csv")

    def run(self):
        """Vectorize the test phrases, predict sentiments, and save them."""
        sources = self.input()
        classifier = joblib.load(sources["model"].path)
        fitted_vectorizer = joblib.load(sources["preprocessing"]["vectorizer"].path)
        test_frame = pd.read_csv(sources["data"]["test"].path, sep="\t")

        # Missing phrases are replaced with empty strings before vectorizing.
        phrases = test_frame["Phrase"].fillna("")
        test_features = fitted_vectorizer.transform(phrases)
        test_frame["Sentiment"] = classifier.predict(test_features)

        # Only the id and the predicted label go into the submission file.
        submission = test_frame[["PhraseId", "Sentiment"]]
        submission.to_csv(self.output().path, index=False)

0 comments on commit a545d0d

Please sign in to comment.