Skip to content

Commit

Permalink
feat: create first model
Browse files Browse the repository at this point in the history
  • Loading branch information
leomaurodesenv committed Nov 7, 2023
1 parent 31321a0 commit a251df9
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 1 deletion.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ pre-commit==3.5.0
kaggle==1.5.16
dvc==3.28.0
luigi==3.4.0
pandas==2.1.2
scikit-learn==1.3.2
89 changes: 89 additions & 0 deletions source/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import luigi
import joblib
import pandas as pd
import scipy.sparse
from data import ExtractRawData
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier


class Preprocessing(luigi.Task):
    """
    NLP Preprocessing

    Fits a bag-of-words ``CountVectorizer`` (lowercased uni- and bi-grams,
    capped at 10,000 features) on the ``Phrase`` column of the training
    file and stores both the resulting sparse matrix and the fitted
    vectorizer so downstream tasks can reuse them.
    """

    def requires(self):
        """Depend on the raw data extraction task."""
        return ExtractRawData()

    def output(self):
        """Return the targets for the sparse matrix and the vectorizer."""
        return {
            "X": luigi.LocalTarget("../data/output/preprocessing.npz"),
            "vectorizer": luigi.LocalTarget("../data/output/preprocessing.joblib"),
        }

    def run(self):
        """Vectorize the training phrases and persist matrix + vectorizer."""
        dataset = self.input()
        targets = self.output()
        train = pd.read_csv(dataset["train"].path, sep="\t")
        # Missing phrases are replaced by an empty string, exactly as done at
        # prediction time: CountVectorizer refuses NaN documents. Rows are
        # filled rather than dropped so that X keeps one row per label when
        # the training task reads "Sentiment" from the same file.
        corpus = train["Phrase"].fillna("")
        vectorizer = CountVectorizer(
            lowercase=True, ngram_range=(1, 2), max_features=10_000
        )
        X = vectorizer.fit_transform(corpus)
        # storing results
        # Neither save_npz nor joblib.dump creates missing parent folders.
        targets["X"].makedirs()
        targets["vectorizer"].makedirs()
        scipy.sparse.save_npz(targets["X"].path, X)
        joblib.dump(vectorizer, targets["vectorizer"].path)


class TrainModel(luigi.Task):
    """
    Train model

    Fits an ``MLPClassifier`` on the vectorized training phrases and dumps
    the fitted estimator with joblib.
    """

    def requires(self):
        """Need the raw data (for labels) and the preprocessing artifacts."""
        upstream = {}
        upstream["data"] = ExtractRawData()
        upstream["preprocessing"] = Preprocessing()
        return upstream

    def output(self):
        """Return the target holding the serialized model."""
        return luigi.LocalTarget("../data/output/model.joblib")

    def run(self):
        """Load features and labels, fit the classifier, store it."""
        sources = self.input()

        # Labels come from the raw training file; features from the
        # matrix written by the preprocessing task.
        train_frame = pd.read_csv(sources["data"]["train"].path, sep="\t")
        labels = train_frame["Sentiment"]
        features = scipy.sparse.load_npz(sources["preprocessing"]["X"].path)

        hyper_params = {
            "max_iter": 500,
            "hidden_layer_sizes": (512, 256),
            "early_stopping": True,
            "random_state": 29,
            "verbose": True,
        }
        classifier = MLPClassifier(**hyper_params)
        classifier.fit(features, labels)

        joblib.dump(classifier, self.output().path)


class Predict(luigi.Task):
    """
    Predict

    Applies the stored vectorizer and model to the test phrases and writes
    a ``PhraseId``/``Sentiment`` CSV.
    """

    def requires(self):
        """Need the raw data, the fitted vectorizer and the trained model."""
        upstream = {}
        upstream["data"] = ExtractRawData()
        upstream["preprocessing"] = Preprocessing()
        upstream["model"] = TrainModel()
        return upstream

    def output(self):
        """Return the target for the submission file."""
        return luigi.LocalTarget("../data/output/submission.csv")

    def run(self):
        """Vectorize the test phrases, predict, and write the submission."""
        sources = self.input()

        vectorizer_path = sources["preprocessing"]["vectorizer"].path
        model_path = sources["model"].path
        test_path = sources["data"]["test"].path

        model = joblib.load(model_path)
        vectorizer = joblib.load(vectorizer_path)
        test = pd.read_csv(test_path, sep="\t")

        # Predicting
        phrases = test["Phrase"].fillna("")
        test_X = vectorizer.transform(phrases)
        test["Sentiment"] = model.predict(test_X)

        submission = test[["PhraseId", "Sentiment"]]
        submission.to_csv(self.output().path, index=False)
1 change: 0 additions & 1 deletion source/preprocessing.py

This file was deleted.

0 comments on commit a251df9

Please sign in to comment.