Merge pull request #7 from realmCode/main
UPDATE pipe.py, model.py, predict.py AND preprocessed files.
Maurya69 authored Oct 29, 2024
2 parents d2fa159 + 729bd55 commit 54e12de
Showing 10 changed files with 25,215 additions and 117 deletions.
Binary file added accuracy_graph.png
Binary file added loss_graph.png
Binary file added model.h5
Binary file not shown.
95 changes: 68 additions & 27 deletions model.py
@@ -1,42 +1,83 @@
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score
-from keras.models import Sequential
 from keras.layers import LSTM, Dense, Embedding, GlobalAveragePooling1D
-from keras.utils import to_categorical, pad_sequences
+from ast import literal_eval
+from pipe import *
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.utils import to_categorical
+import matplotlib.pyplot as plt
+import joblib

-# Load your processed DataFrame (Assuming cleaned_reviews is in the DataFrame)
-df = pd.read_csv('train.csv')

+def plot_graphs(history:Sequential, string):
+    plt.plot(history.history[string])
+    plt.plot(history.history['val_'+string])
+    plt.xlabel("Epochs")
+    plt.ylabel(string)
+    plt.legend([string, 'val_'+string])
+    plt.savefig(string+".png")
+    plt.show()
+
+def plot(history):
+    plot_graphs(history, "accuracy")
+    plot_graphs(history, "loss")

 # Function to create and train the model; you can adjust the arguments as per your requirement
-def train(X_train, y_train, X_val, y_val, embedding_dim=100, batch_size=64, epochs=5):
+def train(x_train, y_train, x_val, y_val, vocab_size=70000, embedding_dim=100, epochs=5, max_length=350, plotg=True):
+    model = Sequential([
+        Embedding(vocab_size, embedding_dim, input_length=max_length),
+        GlobalAveragePooling1D(),
+        Dense(24, activation='relu'),
+        Dense(1, activation='sigmoid')
+    ])
+    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+    history = model.fit(x_train, y_train, epochs=epochs, validation_data=(x_val, y_val))
+    if plotg:
+        plot(history)
+    return model

 # Function to check the accuracy of the model
-def check_accuracy(model, test_df):
-
-    return accuracy
+def check_accuracy(model:Sequential, test_df:pd.DataFrame, preprocessor:NLPPreprocessor):
+    X_test = np.array(preprocessor.generate_word_embeddings(preprocessor.clean(test_df['review'])))
+    scores = model.evaluate(X_test, test_df['Sentiment'])
+    print("Test Score:", scores[0])
+    print("Test Accuracy:", scores[1])
+    return scores

-# Prepare the data
-df = pd.read_csv('processed_train.csv')
-df.drop(columns=['reviews'], inplace=True)  # Drop the original reviews
-
-# Split into training and validation sets
-X_train, X_val, y_train, y_val = train_test_split(df['embeddings'], df['sentiment'], test_size=0.2, random_state=42)
-
-# Train the model
-model = train(X_train, y_train, X_val, y_val)
-test_df = pd.read_csv('test.csv')
+# Load the processed DataFrame (assumes the cleaned reviews are already in it)
+df = pd.read_csv('processed_train.csv')
+
+x = np.stack(df['embeddings'].apply(literal_eval))
+y = np.stack(df['Sentiment'])
+
+x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
+model = train(x_train, y_train, x_val, y_val)
+model.save('model.h5')

-# Evaluate on Test.csv
-print(f"Accuracy on Test set: {check_accuracy(model, test_df)}")
-# Save the trained model and tokenizer
-joblib.dump(model, 'model.pkl')
+test_df = pd.read_csv('test.csv')
+preprocessor = joblib.load('pre_pipeline.pkl')

+# check accuracy
+check_accuracy(model, test_df, preprocessor)
+# Test Score: 0.44428691267967224
+# Test Accuracy: 0.8607199788093567 -> 86.1% test accuracy, close to our val accuracy of 88.76%, so no overfitting :)
+
+################ OUT ####################################
+# Epoch 1/5
+# 625/625 [==============================] - 86s 135ms/step - loss: 0.4868 - accuracy: 0.7851 - val_loss: 0.3007 - val_accuracy: 0.8812
+# Epoch 2/5
+# 625/625 [==============================] - 79s 127ms/step - loss: 0.2252 - accuracy: 0.9162 - val_loss: 0.2670 - val_accuracy: 0.8946
+# Epoch 3/5
+# 625/625 [==============================] - 82s 131ms/step - loss: 0.1450 - accuracy: 0.9500 - val_loss: 0.2712 - val_accuracy: 0.8962
+# Epoch 4/5
+# 625/625 [==============================] - 83s 133ms/step - loss: 0.0966 - accuracy: 0.9693 - val_loss: 0.2963 - val_accuracy: 0.8908
+# Epoch 5/5
+# 625/625 [==============================] - 78s 125ms/step - loss: 0.0653 - accuracy: 0.9811 - val_loss: 0.3199 - val_accuracy: 0.8876
+# 782/782 [==============================] - 6s 7ms/step - loss: 0.3961 - accuracy: 0.8641
+# Test Score: 0.3961312770843506
+# Test Accuracy: 0.8641200065612793
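
A note on the embeddings round trip above: df.to_csv writes the list-valued embeddings column to processed_train.csv as plain strings, which is why model.py must run literal_eval over the column before np.stack can rebuild the (samples, max_length) matrix. A minimal sketch of that round trip, using a hypothetical toy.csv in place of the real file:

from ast import literal_eval

import numpy as np
import pandas as pd

# Hypothetical stand-in for the padded token sequences saved by pipe.py.
df = pd.DataFrame({"embeddings": [[1, 2, 0], [3, 4, 5]], "Sentiment": [0, 1]})
df.to_csv("toy.csv", index=False)

loaded = pd.read_csv("toy.csv")
print(type(loaded["embeddings"][0]))   # <class 'str'> -- the lists came back as text

x = np.stack(loaded["embeddings"].apply(literal_eval))  # parse the strings, stack to a matrix
y = loaded["Sentiment"].to_numpy()
print(x.shape, y.shape)                # (2, 3) (2,)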

142 changes: 86 additions & 56 deletions pipe.py
@@ -5,98 +5,128 @@
 from nltk.stem import WordNetLemmatizer
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
-from gensim.models import Word2Vec
+from keras.preprocessing.text import Tokenizer
+from keras.utils import pad_sequences
+import numpy as np
+import pandas as pd
+import joblib
-# Ensure you have the NLTK resources
-nltk.download('stopwords')
-nltk.download('wordnet')
+import re
+try:
+    stopwords.words("english")
+except:
+    nltk.download('stopwords')
+    nltk.download('wordnet')
+pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
+TAG_RE = re.compile(r'<[^>]+>')  # remove HTML tags

+class RemoveTags(BaseEstimator, TransformerMixin):
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: TAG_RE.sub('', text.lower()))  # this is the first step of the pipeline, so it also lowercases the text for the later steps
+
+    def fit(self, X:pd.DataFrame, y=None):
+        return self
+
+class RemoveSingleChar(BaseEstimator, TransformerMixin):
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: re.sub(r"\s+[a-zA-Z]\s+", ' ', text))
+
+    def fit(self, X:pd.DataFrame, y=None):
+        return self
+
 class RemovePunctuation(BaseEstimator, TransformerMixin):
-    # write actual logic for removing punctuation
-    def transform(self, X):
-        pass
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: re.sub('[^a-zA-Z]', ' ', text))

-    def fit(self, X, y=None):
+    def fit(self, X:pd.DataFrame, y=None):
         return self


 class RemoveExtraSpaces(BaseEstimator, TransformerMixin):
-    # write actual logic for removing extra spaces
-    def transform(self, X):
-        pass
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: re.sub(r'\s+', ' ', text))

-    def fit(self, X, y=None):
+    def fit(self, X:pd.DataFrame, y=None):
         return self


 class RemoveStopWords(BaseEstimator, TransformerMixin):
-    def __init__(self):
-        self.stop_words = set(stopwords.words('english'))
-    # write actual logic for removing stop words
-    def transform(self, X):
-        pass
-    def fit(self, X, y=None):
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: pattern.sub('', text))
+    def fit(self, X:pd.DataFrame, y=None):
         return self


 class LemmatizeText(BaseEstimator, TransformerMixin):
     def __init__(self):
         self.lemmatizer = WordNetLemmatizer()
-    # write actual logic for lemmatizing text
-    def transform(self, X):
-        pass
-    def fit(self, X, y=None):
+
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: ''.join([self.lemmatizer.lemmatize(word)+" " for word in text.split()]))
+
+    def fit(self, X:pd.DataFrame, y=None):
         return self


 class NLPPreprocessor(BaseEstimator, TransformerMixin):
-    def __init__(self):
-        """
-        here you must define your model as a parameter
-        for eg: self.embedding_model = TfidfVectorizer()
-        """
-        pass
-    def transform(self, X):
-        return X  # No additional transformation at this level
-
-    def fit(self, X, y=None):
+    def __init__(self, vocabsize=70000, max_seq_length=350):
+        self.max_seq_length = max_seq_length
+        self.embedding_model = Tokenizer(num_words=vocabsize, oov_token="<oov>")
+        self.pipeline = Pipeline(steps=[
+            ('remove_tags', RemoveTags()),
+            ('remove_punctuation', RemovePunctuation()),
+            ('remove_single_char', RemoveSingleChar()),
+            ('remove_extra_spaces', RemoveExtraSpaces()),
+            ('remove_stop_words', RemoveStopWords()),
+            ('lemmatize', LemmatizeText()),
+        ])
+    def fit(self, texts: pd.DataFrame, y=None):
         return self
-    # write logic to generate word embeddings, you can also change input arguments as per your requirement
-    def generate_word_embeddings(self, texts, vector_size=100, window=5, min_count=1, workers=4):
-        pass
-    # write logic to generate embedding for a single review

+    def generatetokens(self, texts: pd.DataFrame):
+        self.embedding_model.fit_on_texts(texts.tolist())
+        return
+
+    def generate_word_embeddings(self, texts: pd.DataFrame):
+        padedsequences = pad_sequences(self.embedding_model.texts_to_sequences(texts), maxlen=self.max_seq_length, padding='post', truncating='post')
+        return padedsequences
+
     def single_review_embedding(self, text):
-        pass
+        padedsequences = pad_sequences(self.embedding_model.texts_to_sequences(text), maxlen=self.max_seq_length, padding='post', truncating='post')
+        return padedsequences

-pipeline = Pipeline(steps=[
-    ('remove_punctuation', RemovePunctuation()),
-    ('remove_extra_spaces', RemoveExtraSpaces()),
-    ('remove_stop_words', RemoveStopWords()),
-    ('lemmatize', LemmatizeText()),
-])
+    def clean(self, texts:pd.DataFrame):
+        return self.pipeline.transform(texts)
+
+    def save(self):
+        joblib.dump(self, 'pre_pipeline.pkl')


-df = pd.read_csv('train.csv')
-reviews = df['reviews']
-# getting the cleaned reviews
-cleaned_reviews = pipeline.transform(reviews)
+def preprocess():
+    preprocessor = NLPPreprocessor()
+    df = pd.read_csv('train.csv')

-# Creating a new list for embeddings
-preprocessor = NLPPreprocessor()
-
-# Generate embeddings for all cleaned reviews at once
-embeddings = preprocessor.generate_word_embeddings(cleaned_reviews)  # Generate embeddings for the entire list
-
-# If needed, convert the embeddings to a list or DataFrame for easier handling
-embeddings_list = embeddings.tolist()  # Convert to list if embeddings are in a numpy array
-
-df['embeddings'] = embeddings_list
+    # save necessary things for further usage
+    df['review'] = preprocessor.clean(df['review'])
+
+    # fit the tokenizer only on the train reviews; it must not be refit on test data
+    preprocessor.generatetokens(df['review'])
+
+    df['embeddings'] = preprocessor.generate_word_embeddings(df['review']).tolist()
+
+    # print(f"vocab size = ", len(preprocessor.embedding_model.word_counts)+1)
+    # 66507 -> total vocab

-# Save the processed DataFrame
-pd.to_csv('processed_train.csv')
-# Saving the pipeline
-joblib.dump(pipeline, 'pre_pipeline.pkl')
+    df.to_csv("processed_train.csv", index=False)
+
+    preprocessor.save()

+# preprocess()
+# [Done] exited with code=0 in 59.544 seconds
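
For reference, the rewritten NLPPreprocessor is meant to be driven in the order clean -> generatetokens -> generate_word_embeddings, with the tokenizer fitted on training text only (as the comment in preprocess() warns). A minimal usage sketch under that assumption; the two toy reviews are hypothetical:

import pandas as pd
from pipe import NLPPreprocessor

# Toy reviews standing in for train.csv's review column.
train_reviews = pd.Series([
    "<br>I loved this movie, a truly great film!",
    "Terrible plot. I hated it.",
])

pre = NLPPreprocessor(vocabsize=70000, max_seq_length=350)
cleaned = pre.clean(train_reviews)         # lowercase, strip tags/punctuation/stop words, lemmatize
pre.generatetokens(cleaned)                # fit the Tokenizer on *train* text only
x = pre.generate_word_embeddings(cleaned)  # (2, 350) padded matrix of token indices
print(x.shape)

pre.save()  # writes pre_pipeline.pkl for model.py and predict.py to load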
Binary file added pre_pipeline.pkl
Binary file not shown.
Binary file added pre_pipeline_backup.pkl
Binary file not shown.
81 changes: 54 additions & 27 deletions predict.py
@@ -1,34 +1,61 @@
-import pandas as pd
-import joblib
 from keras.models import load_model
-from pipe import NLPPreprocessor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+from keras.models import Sequential
+from keras.layers import LSTM, Dense, Embedding, GlobalAveragePooling1D
+from keras.utils import to_categorical, pad_sequences
+from ast import literal_eval
+from pipe import *
+from model import *
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import joblib

-# Load the trained model, preprocessing pipeline, and embedding generator
-model_file = 'model.pkl'
-pipeline_file = 'pre_pipeline.pkl'
-
-# Load the model
-model = joblib.load(model_file)
-
-# Load the preprocessing pipeline
-preprocessor_pipeline = joblib.load(pipeline_file)
-
-# Get user input for the review
-review = input("Enter the review to predict sentiment: ")
-
-# Preprocess the review using the pipeline
-processed_review = preprocessor_pipeline.transform([review])
-
-# Generate embeddings using the embedding generator
-embeddings = NLPPreprocessor().single_review_embedding(processed_review)
-
-# Make the prediction
-prediction = model.predict(embeddings)
-predicted_class = prediction.argmax(axis=-1)  # Get the index of the class with the highest probability
-
-# Interpret the prediction (adjust based on your labeling)
-sentiment = "Positive" if predicted_class[0] == 1 else "Negative"
-
-print(f"The predicted sentiment is: {sentiment}")
+# Load the trained model, preprocessing pipeline, and embedding generator
+model_file = 'model.h5'
+preprocessor = joblib.load('pre_pipeline.pkl')

+# Load the model
+model = load_model(model_file)

+# Extra axis for sentiment relations: map the sigmoid probability to finer-grained labels
+def interpret_sentiment(prob):
+    if prob <= 0.20:
+        return "Strongly Negative"
+    elif prob <= 0.40:
+        return "Negative"
+    elif prob <= 0.49:
+        return "Neutral / Slightly Negative"
+    elif prob == 0.50:
+        return "Neutral / Ambiguous"
+    elif prob <= 0.60:
+        return "Neutral / Slightly Positive"
+    elif prob <= 0.80:
+        return "Positive"
+    else:
+        return "Strongly Positive"
+
+def predict_sentiment(review:str):
+    # Preprocess the review using the pipeline
+    a = {"a": [review]}
+    processed_review = preprocessor.clean(pd.DataFrame(a)['a'])
+    # Generate embeddings using the embedding generator
+    embeddings = preprocessor.single_review_embedding(processed_review)
+
+    # Make the prediction
+    prediction = model.predict(embeddings)
+    predicted_prob = prediction[0]
+    # Interpret the prediction (adjust based on your labeling)
+    sentiment = interpret_sentiment(predicted_prob)
+
+    print(f"The predicted sentiment is: {sentiment}")
+
+predict_sentiment("Harry potter is magical movie, just amazing, i liked whole plot.")
+
+# 1/1 [==============================] - ETA: 0s
+# 1/1 [==============================] - 1s 910ms/step
+# The predicted sentiment is: Strongly Positive
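
Because predict_sentiment reuses the saved pipeline and model, a batch variant only needs to hand clean() a whole Series instead of a one-element column. A sketch under the same file-name assumptions (model.h5, pre_pipeline.pkl), with a plain 0.5 cutoff standing in for interpret_sentiment's finer buckets:

import joblib
import pandas as pd
from keras.models import load_model

model = load_model('model.h5')
preprocessor = joblib.load('pre_pipeline.pkl')  # unpickling needs pipe.py importable

def predict_batch(reviews):
    cleaned = preprocessor.clean(pd.Series(reviews))             # same cleaning as training
    embeddings = preprocessor.generate_word_embeddings(cleaned)  # (n, 350) index matrix
    probs = model.predict(embeddings)[:, 0]                      # one sigmoid probability per review
    return ["Positive" if p > 0.5 else "Negative" for p in probs]

print(predict_batch(["Loved it!", "Waste of time."]))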