forked from Maurya69/Movie_Review-Contribute-a-thon-
Commit
Merge pull request #7 from realmCode/main
UPDATE pipe.py, model.py, predict.py AND preprocessed files.
Showing 10 changed files with 25,215 additions and 117 deletions.
model.py
@@ -1,42 +1,83 @@
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, GlobalAveragePooling1D
from keras.utils import to_categorical, pad_sequences
from ast import literal_eval
from pipe import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import joblib

# Load your processed DataFrame (Assuming cleaned_reviews is in the DataFrame)
df = pd.read_csv('train.csv')

def plot_graphs(history:Sequential, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.savefig(string+".png")
    plt.show()

def plot(history):
    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")

# Function to create and train the model, you can adjust the arguments as per your requirement
def train(X_train, y_train, X_val, y_val, embedding_dim=100, batch_size=64, epochs=5):

def train(x_train, y_train, x_val, y_val, vocab_size = 70000, embedding_dim=100, epochs=5, max_length=350, plotg=True):
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(x_train, y_train, epochs=epochs, validation_data=(x_val, y_val))
    if plotg:
        plot(history)
    return model

# Function to check the accuracy of the model
def check_accuracy(model, test_df):

    return accuracy

def check_accuracy(model:Sequential, test_df:pd.DataFrame, preprocessor:NLPPreprocessor):
    X_test = np.array(preprocessor.generate_word_embeddings(preprocessor.clean(test_df['review'])))
    scores = model.evaluate(X_test, test_df['Sentiment'])
    print("Test Score:", scores[0])
    print("Test Accuracy:", scores[1])
    return scores

# Prepare the data
df = pd.read_csv('processed_train.csv')
df.drop(columns=['reviews'], inplace=True)  # Drop the original reviews

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(df['embeddings'], df['sentiment'], test_size=0.2, random_state=42)

# Train the model
model = train(X_train, y_train, X_val, y_val)
test_df = pd.read_csv('test.csv')
# Load your processed DataFrame (Assuming cleaned_reviews is in the DataFrame)
df = pd.read_csv('processed_train.csv')

x = np.stack(df['embeddings'].apply(literal_eval))
y = np.stack(df['Sentiment'])

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
model = train(x_train, y_train, x_val, y_val)
model.save('model.h5')

test_df = pd.read_csv('test.csv')
# Evaluate on Test.csv
print(f"Accuracy on Test set: {check_accuracy(model, test_df)}")
# Save the trained model and tokenizer
joblib.dump(model, 'model.pkl')
preprocessor = joblib.load('pre_pipeline.pkl')

# check accuracy
check_accuracy(model, test_df, preprocessor)
# Test Score: 0.44428691267967224
# Test Accuracy: 0.8607199788093567 -> ~86% test accuracy, close to our val accuracy of 88.76%, so the model is not overfitting :)

################ OUT ####################################
# Epoch 1/5
# 625/625 [==============================] - 86s 135ms/step - loss: 0.4868 - accuracy: 0.7851 - val_loss: 0.3007 - val_accuracy: 0.8812
# Epoch 2/5
# 625/625 [==============================] - 79s 127ms/step - loss: 0.2252 - accuracy: 0.9162 - val_loss: 0.2670 - val_accuracy: 0.8946
# Epoch 3/5
# 625/625 [==============================] - 82s 131ms/step - loss: 0.1450 - accuracy: 0.9500 - val_loss: 0.2712 - val_accuracy: 0.8962
# Epoch 4/5
# 625/625 [==============================] - 83s 133ms/step - loss: 0.0966 - accuracy: 0.9693 - val_loss: 0.2963 - val_accuracy: 0.8908
# Epoch 5/5
# 625/625 [==============================] - 78s 125ms/step - loss: 0.0653 - accuracy: 0.9811 - val_loss: 0.3199 - val_accuracy: 0.8876
# 782/782 [==============================] - 6s 7ms/step - loss: 0.3961 - accuracy: 0.8641
# Test Score: 0.3961312770843506
# Test Accuracy: 0.8641200065612793
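One step in the updated model.py that is easy to misread is how the training matrix is rebuilt: processed_train.csv stores each review's padded token-index sequence as a stringified Python list, so the script parses the embeddings column with literal_eval and stacks the rows with np.stack before handing them to Keras. A minimal, self-contained sketch of that round trip (the toy rows below are made up for illustration; the column names follow the script):

from ast import literal_eval

import numpy as np
import pandas as pd

# Toy stand-in for processed_train.csv: pd.read_csv returns list-valued
# cells as plain strings, so each 'embeddings' entry is text like "[12, 7, 0, 0]".
df = pd.DataFrame({
    "embeddings": ["[12, 7, 0, 0]", "[3, 44, 9, 0]"],
    "Sentiment": [1, 0],
})

# Parse each string back into a Python list, then stack into a 2-D array
# shaped (num_reviews, max_length), which is what the Embedding layer expects.
x = np.stack(df["embeddings"].apply(literal_eval))
y = np.stack(df["Sentiment"])

print(x.shape)  # (2, 4)
print(y)        # [1 0]

Storing ready-made index sequences this way is what lets the training script skip tokenization entirely and feed the CSV straight into the network.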
predict.py
@@ -1,34 +1,61 @@
import pandas as pd
import joblib
from keras.models import load_model
from pipe import NLPPreprocessor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, GlobalAveragePooling1D
from keras.utils import to_categorical, pad_sequences
from ast import literal_eval
from pipe import *
from model import *
# Load the trained model, preprocessing pipeline, and embedding generator
model_file = 'model.pkl'
pipeline_file = 'pre_pipeline.pkl'

# Load the model
model = joblib.load(model_file)

# Load the preprocessing pipeline
preprocessor_pipeline = joblib.load(pipeline_file)

# Get user input for the review
review = input("Enter the review to predict sentiment: ")

# Preprocess the review using the pipeline
processed_review = preprocessor_pipeline.transform([review])

# Generate embeddings using the embedding generator
embeddings = NLPPreprocessor().single_review_embedding(processed_review)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

# Make the prediction
prediction = model.predict(embeddings)
predicted_class = prediction.argmax(axis=-1)  # Get the index of the class with the highest probability
# Load the trained model, preprocessing pipeline, and embedding generator
model_file = 'model.h5'
preprocessor = joblib.load('pre_pipeline.pkl')

# Interpret the prediction (adjust based on your labeling)
sentiment = "Positive" if predicted_class[0] == 1 else "Negative"

print(f"The predicted sentiment is: {sentiment}")
# Load the model
model = load_model(model_file)

# extra axis for sentiment relations
def interpret_sentiment(prob):
    if prob <= 0.20:
        return "Strongly Negative"
    elif prob <= 0.40:
        return "Negative"
    elif prob <= 0.49:
        return "Neutral / Slightly Negative"
    elif prob == 0.50:
        return "Neutral / Ambiguous"
    elif prob <= 0.60:
        return "Neutral / Slightly Positive"
    elif prob <= 0.80:
        return "Positive"
    else:
        return "Strongly Positive"

def predict_sentiment(review:str):
    # Preprocess the review using the pipeline
    a = {"a": [review]}
    processed_review = preprocessor.clean(pd.DataFrame(a)['a'])
    # Generate embeddings using the embedding generator
    embeddings = preprocessor.single_review_embedding(processed_review)

    # Make the prediction
    prediction = model.predict(embeddings)
    predicted_prob = prediction[0]
    # Interpret the prediction (adjust based on your labeling)
    sentiment = interpret_sentiment(predicted_prob)

    print(f"The predicted sentiment is: {sentiment}")

predict_sentiment("Harry potter is magical movie, just amazing, i liked whole plot.")

# 1/1 [==============================] - ETA: 0s
# 1/1 [==============================] - 1s 910ms/step
# The predicted sentiment is: Strongly Positive
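pipe.py is not rendered in this view, so the committed NLPPreprocessor implementation is not shown here; all that model.py and predict.py rely on is its interface: clean() on a Series of raw reviews, generate_word_embeddings() for a batch of cleaned reviews, and single_review_embedding() for one review. The sketch below is a hypothetical stand-in for that interface, assuming a fitted Keras Tokenizer with the same vocab_size (70000) and max_length (350) used in train(); it is not the actual pipe.py.

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


class NLPPreprocessor:
    """Hypothetical sketch of the interface that model.py and predict.py call."""

    def __init__(self, vocab_size=70000, max_length=350):
        self.max_length = max_length
        self.tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")

    def clean(self, reviews: pd.Series) -> pd.Series:
        # Lowercase and keep only letters and whitespace.
        return reviews.str.lower().str.replace(r"[^a-z\s]", " ", regex=True)

    def fit(self, reviews: pd.Series) -> "NLPPreprocessor":
        # Build the word -> index vocabulary once, on the training reviews.
        self.tokenizer.fit_on_texts(self.clean(reviews))
        return self

    def generate_word_embeddings(self, cleaned_reviews) -> np.ndarray:
        # Token-index sequences padded/truncated to a fixed length.
        sequences = self.tokenizer.texts_to_sequences(list(cleaned_reviews))
        return pad_sequences(sequences, maxlen=self.max_length,
                             padding="post", truncating="post")

    def single_review_embedding(self, cleaned_review) -> np.ndarray:
        # Returns shape (1, max_length), ready for model.predict().
        return self.generate_word_embeddings(cleaned_review)

Whichever way the real pipe.py implements this, pre_pipeline.pkl has to carry the fitted tokenizer state, since predict.py must map words to the same indices the model saw during training.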