From a545d0d3cf3f607c63396afda9de02efb8fe836d Mon Sep 17 00:00:00 2001
From: Leonardo Mauro
Date: Mon, 6 Nov 2023 21:22:07 -0300
Subject: [PATCH] feat: add first model (#5)

* feat: refactoring raw data
* feat: create first model
---
 README.md              |  3 +-
 requirements.txt       |  2 +
 source/data.py         | 32 +++++++++++++++
 source/get_raw_data.py | 27 -------------
 source/model.py        | 89 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 125 insertions(+), 28 deletions(-)
 create mode 100644 source/data.py
 delete mode 100644 source/get_raw_data.py
 create mode 100644 source/model.py

diff --git a/README.md b/README.md
index 0d77e61..3adf309 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@ This is a learning repository about DVC Data Version Control and Luigi Pipelines
 
 - luigi, dvc, pre-commit
-- setup https://pre-commit.com/
+- https://luigi.readthedocs.io/
+- setup https://pre-commit.com/, https://pre-commit.com/hooks.html
 - setup https://github.com/Kaggle/kaggle-api
 - `kaggle competitions download -c sentiment-analysis-on-movie-reviews -p data`
 
diff --git a/requirements.txt b/requirements.txt
index c0d9d92..0b8ec5c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,5 @@ pre-commit==3.5.0
 kaggle==1.5.16
 dvc==3.28.0
 luigi==3.4.0
+pandas==2.1.2
+scikit-learn==1.3.2
diff --git a/source/data.py b/source/data.py
new file mode 100644
index 0000000..1a97140
--- /dev/null
+++ b/source/data.py
@@ -0,0 +1,32 @@
+import luigi
+import zipfile
+
+
+class SentimentAnalysisZipFile(luigi.ExternalTask):
+    """
+    Raw data zip file
+    """
+
+    def output(self):
+        return luigi.LocalTarget("../data/sentiment-analysis-on-movie-reviews.zip")
+
+
+class ExtractRawData(luigi.Task):
+    """
+    Extract raw data from zip file
+    """
+
+    def requires(self):
+        return SentimentAnalysisZipFile()
+
+    def output(self):
+        return {
+            "test": luigi.LocalTarget("../data/output/test.tsv.zip"),
+            "train": luigi.LocalTarget("../data/output/train.tsv.zip"),
+            "sampleSubmission": luigi.LocalTarget("../data/output/sampleSubmission.csv"),
+        }
+
+    def run(self):
+        # Unzip data file
+        with zipfile.ZipFile(self.input().path, "r") as zip_ref:
+            zip_ref.extractall("../data/output/")
diff --git a/source/get_raw_data.py b/source/get_raw_data.py
deleted file mode 100644
index b721833..0000000
--- a/source/get_raw_data.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import os
-import luigi
-import zipfile
-
-
-class ExtractRawData(luigi.Task):
-    """
-    Extract raw data from zip file
-    """
-
-    data_path = luigi.Parameter(
-        default="../data/sentiment-analysis-on-movie-reviews.zip"
-    )
-
-    def output(self):
-        return {
-            "test": luigi.LocalTarget("../data/output/test.tsv.zip"),
-            "train": luigi.LocalTarget("../data/output/train.tsv.zip"),
-        }
-
-    def run(self):
-        # Check if data file exists
-        assert os.path.exists(self.data_path)
-
-        # Unzip data file
-        with zipfile.ZipFile(self.data_path, "r") as zip_ref:
-            zip_ref.extractall("../data/output/")
diff --git a/source/model.py b/source/model.py
new file mode 100644
index 0000000..a0e44d7
--- /dev/null
+++ b/source/model.py
@@ -0,0 +1,89 @@
+import luigi
+import joblib
+import pandas as pd
+import scipy.sparse
+from data import ExtractRawData
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.neural_network import MLPClassifier
+
+
+class Preprocessing(luigi.Task):
+    """
+    NLP Preprocessing
+    """
+
+    def requires(self):
+        return ExtractRawData()
+
+    def output(self):
+        return {
+            "X": luigi.LocalTarget("../data/output/preprocessing.npz"),
+            "vectorizer": luigi.LocalTarget("../data/output/preprocessing.joblib"),
+        }
+
+    def run(self):
+        dataset = self.input()
+        train = pd.read_csv(dataset["train"].path, sep="\t")
+        corpus = train["Phrase"]
+        vectorizer = CountVectorizer(
+            lowercase=True, ngram_range=(1, 2), max_features=10_000
+        )
+        X = vectorizer.fit_transform(corpus)
+        # storing results
+        scipy.sparse.save_npz(self.output()["X"].path, X)
+        joblib.dump(vectorizer, self.output()["vectorizer"].path)
+
+
+class TrainModel(luigi.Task):
+    """
+    Train model
+    """
+
+    def requires(self):
+        return {
+            "data": ExtractRawData(),
+            "preprocessing": Preprocessing(),
+        }
+
+    def output(self):
+        return luigi.LocalTarget("../data/output/model.joblib")
+
+    def run(self):
+        _input = self.input()
+        X = scipy.sparse.load_npz(_input["preprocessing"]["X"].path)
+        y = pd.read_csv(_input["data"]["train"].path, sep="\t")["Sentiment"]
+        model = MLPClassifier(
+            max_iter=500,
+            hidden_layer_sizes=(512, 256),
+            early_stopping=True,
+            random_state=29,
+            verbose=True,
+        )
+        model.fit(X, y)
+        joblib.dump(model, self.output().path)
+
+
+class Predict(luigi.Task):
+    """
+    Predict
+    """
+
+    def requires(self):
+        return {
+            "data": ExtractRawData(),
+            "preprocessing": Preprocessing(),
+            "model": TrainModel(),
+        }
+
+    def output(self):
+        return luigi.LocalTarget("../data/output/submission.csv")
+
+    def run(self):
+        _input = self.input()
+        model = joblib.load(_input["model"].path)
+        vectorizer = joblib.load(_input["preprocessing"]["vectorizer"].path)
+        test = pd.read_csv(_input["data"]["test"].path, sep="\t")
+        # Predicting
+        test_X = vectorizer.transform(test["Phrase"].fillna(""))
+        test["Sentiment"] = model.predict(test_X)
+        test[["PhraseId", "Sentiment"]].to_csv(self.output().path, index=False)
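
Usage note: the new Luigi tasks read and write paths relative to the source/ directory ("../data/output/"), so they are presumably meant to be run from there, after the Kaggle zip from the README step has been downloaded into data/. A minimal local runner sketch (run.py is a hypothetical helper, not a file added by this patch):

    # run.py: hypothetical helper, executed from the source/ directory.
    # Building Predict pulls in the whole dependency chain:
    # ExtractRawData -> Preprocessing -> TrainModel -> Predict
    import luigi

    from model import Predict

    if __name__ == "__main__":
        luigi.build([Predict()], local_scheduler=True)

Equivalently, Luigi's command-line entry point can be invoked from source/ with `python -m luigi --module model Predict --local-scheduler`.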