Merge pull request #7 from realmCode/main
UPDATE pipe.py, model.py, predict.py AND preprocessed files.
Maurya69 authored Oct 29, 2024
2 parents d2fa159 + 729bd55 commit 54e12de
Showing 10 changed files with 25,215 additions and 117 deletions.
Binary file added accuracy_graph.png
Binary file added loss_graph.png
Binary file added model.h5
Binary file not shown.
95 changes: 68 additions & 27 deletions model.py
@@ -1,42 +1,83 @@
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score
-from keras.models import Sequential
 from keras.layers import LSTM, Dense, Embedding, GlobalAveragePooling1D
-from keras.utils import to_categorical, pad_sequences
+from ast import literal_eval
+from pipe import *
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.utils import to_categorical
+import matplotlib.pyplot as plt
+import joblib

-# Load your processed DataFrame (Assuming cleaned_reviews is in the DataFrame)
-df = pd.read_csv('train.csv')

+def plot_graphs(history:Sequential, string):
+    plt.plot(history.history[string])
+    plt.plot(history.history['val_'+string])
+    plt.xlabel("Epochs")
+    plt.ylabel(string)
+    plt.legend([string, 'val_'+string])
+    plt.savefig(string+".png")
+    plt.show()
+
+def plot(history):
+    plot_graphs(history, "accuracy")
+    plot_graphs(history, "loss")

 # Function to create and train the model; you can adjust the arguments as per your requirement
-def train(X_train, y_train, X_val, y_val, embedding_dim=100, batch_size=64, epochs=5):
+def train(x_train, y_train, x_val, y_val, vocab_size=70000, embedding_dim=100, epochs=5, max_length=350, plotg=True):
+    model = Sequential([
+        Embedding(vocab_size, embedding_dim, input_length=max_length),
+        GlobalAveragePooling1D(),
+        Dense(24, activation='relu'),
+        Dense(1, activation='sigmoid')
+    ])
+    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+    history = model.fit(x_train, y_train, epochs=epochs, validation_data=(x_val, y_val))
+    if plotg:
+        plot(history)
+    return model

 # Function to check the accuracy of the model
-def check_accuracy(model, test_df):
-
-    return accuracy
+def check_accuracy(model:Sequential, test_df:pd.DataFrame, preprocessor:NLPPreprocessor):
+    X_test = np.array(preprocessor.generate_word_embeddings(preprocessor.clean(test_df['review'])))
+    scores = model.evaluate(X_test, test_df['Sentiment'])
+    print("Test Score:", scores[0])
+    print("Test Accuracy:", scores[1])
+    return scores

-# Prepare the data
-df = pd.read_csv('processed_train.csv')
-df.drop(columns=['reviews'], inplace=True)  # Drop the original reviews
-
-# Split into training and validation sets
-X_train, X_val, y_train, y_val = train_test_split(df['embeddings'], df['sentiment'], test_size=0.2, random_state=42)
-
-# Train the model
-model = train(X_train, y_train, X_val, y_val)
-test_df = pd.read_csv('test.csv')
+# Load the processed DataFrame (assumes the cleaned reviews are already in it)
+df = pd.read_csv('processed_train.csv')
+
+x = np.stack(df['embeddings'].apply(literal_eval))
+y = np.stack(df['Sentiment'])
+
+x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
+model = train(x_train, y_train, x_val, y_val)
+model.save('model.h5')

-# Evaluate on Test.csv
-print(f"Accuracy on Test set: {check_accuracy(model, test_df)}")
-# Save the trained model and tokenizer
-joblib.dump(model, 'model.pkl')
+test_df = pd.read_csv('test.csv')
+preprocessor = joblib.load('pre_pipeline.pkl')

+# check accuracy
+check_accuracy(model, test_df, preprocessor)
+# Test Score: 0.44428691267967224
+# Test Accuracy: 0.8607199788093567 -> 86.1% test accuracy, close to our val accuracy of 88.76%, so no overfitting :)
+
+################ OUT ####################################
+# Epoch 1/5
+# 625/625 [==============================] - 86s 135ms/step - loss: 0.4868 - accuracy: 0.7851 - val_loss: 0.3007 - val_accuracy: 0.8812
+# Epoch 2/5
+# 625/625 [==============================] - 79s 127ms/step - loss: 0.2252 - accuracy: 0.9162 - val_loss: 0.2670 - val_accuracy: 0.8946
+# Epoch 3/5
+# 625/625 [==============================] - 82s 131ms/step - loss: 0.1450 - accuracy: 0.9500 - val_loss: 0.2712 - val_accuracy: 0.8962
+# Epoch 4/5
+# 625/625 [==============================] - 83s 133ms/step - loss: 0.0966 - accuracy: 0.9693 - val_loss: 0.2963 - val_accuracy: 0.8908
+# Epoch 5/5
+# 625/625 [==============================] - 78s 125ms/step - loss: 0.0653 - accuracy: 0.9811 - val_loss: 0.3199 - val_accuracy: 0.8876
+# 782/782 [==============================] - 6s 7ms/step - loss: 0.3961 - accuracy: 0.8641
+# Test Score: 0.3961312770843506
+# Test Accuracy: 0.8641200065612793
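
A note on the embeddings round trip above: df.to_csv writes the list-valued embeddings column to processed_train.csv as plain strings, which is why model.py must run literal_eval over the column before np.stack can rebuild the (samples, max_length) matrix. A minimal sketch of that round trip, using a hypothetical toy.csv in place of the real file:

from ast import literal_eval

import numpy as np
import pandas as pd

# Hypothetical stand-in for the padded token sequences saved by pipe.py.
df = pd.DataFrame({"embeddings": [[1, 2, 0], [3, 4, 5]], "Sentiment": [0, 1]})
df.to_csv("toy.csv", index=False)

loaded = pd.read_csv("toy.csv")
print(type(loaded["embeddings"][0]))   # <class 'str'> -- the lists came back as text

x = np.stack(loaded["embeddings"].apply(literal_eval))  # parse the strings, stack to a matrix
y = loaded["Sentiment"].to_numpy()
print(x.shape, y.shape)                # (2, 3) (2,)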

142 changes: 86 additions & 56 deletions pipe.py
@@ -5,98 +5,128 @@
 from nltk.stem import WordNetLemmatizer
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
-from gensim.models import Word2Vec
+from keras.preprocessing.text import Tokenizer
+from keras.utils import pad_sequences
+import numpy as np
+import pandas as pd
+import joblib
-# Ensure you have the NLTK resources
-nltk.download('stopwords')
-nltk.download('wordnet')
+import re
+try:
+    stopwords.words("english")
+except:
+    nltk.download('stopwords')
+    nltk.download('wordnet')
+pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
+TAG_RE = re.compile(r'<[^>]+>')  # remove HTML tags

+class RemoveTags(BaseEstimator, TransformerMixin):
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: TAG_RE.sub('', text.lower()))  # this is the first step of the pipeline, so it also lowercases the text for the later steps
+
+    def fit(self, X:pd.DataFrame, y=None):
+        return self
+
+class RemoveSingleChar(BaseEstimator, TransformerMixin):
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: re.sub(r"\s+[a-zA-Z]\s+", ' ', text))
+
+    def fit(self, X:pd.DataFrame, y=None):
+        return self
+
 class RemovePunctuation(BaseEstimator, TransformerMixin):
-    # write actual logic for removing punctuation
-    def transform(self, X):
-        pass
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: re.sub('[^a-zA-Z]', ' ', text))

-    def fit(self, X, y=None):
+    def fit(self, X:pd.DataFrame, y=None):
         return self


 class RemoveExtraSpaces(BaseEstimator, TransformerMixin):
-    # write actual logic for removing extra spaces
-    def transform(self, X):
-        pass
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: re.sub(r'\s+', ' ', text))

-    def fit(self, X, y=None):
+    def fit(self, X:pd.DataFrame, y=None):
         return self


 class RemoveStopWords(BaseEstimator, TransformerMixin):
-    def __init__(self):
-        self.stop_words = set(stopwords.words('english'))
-    # write actual logic for removing stop words
-    def transform(self, X):
-        pass
-    def fit(self, X, y=None):
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: pattern.sub('', text))
+    def fit(self, X:pd.DataFrame, y=None):
         return self


 class LemmatizeText(BaseEstimator, TransformerMixin):
     def __init__(self):
         self.lemmatizer = WordNetLemmatizer()
-    # write actual logic for lemmatizing text
-    def transform(self, X):
-        pass
-    def fit(self, X, y=None):
+
+    def transform(self, X:pd.DataFrame):
+        return X.apply(lambda text: ''.join([self.lemmatizer.lemmatize(word)+" " for word in text.split()]))
+
+    def fit(self, X:pd.DataFrame, y=None):
         return self


 class NLPPreprocessor(BaseEstimator, TransformerMixin):
-    def __init__(self):
-        """
-        here you must define your model as a parameter
-        for eg: self.embedding_model = TfidfVectorizer()
-        """
-        pass
-    def transform(self, X):
-        return X  # No additional transformation at this level
-
-    def fit(self, X, y=None):
+    def __init__(self, vocabsize=70000, max_seq_length=350):
+        self.max_seq_length = max_seq_length
+        self.embedding_model = Tokenizer(num_words=vocabsize, oov_token="<oov>")
+        self.pipeline = Pipeline(steps=[
+            ('remove_tags', RemoveTags()),
+            ('remove_punctuation', RemovePunctuation()),
+            ('remove_single_char', RemoveSingleChar()),
+            ('remove_extra_spaces', RemoveExtraSpaces()),
+            ('remove_stop_words', RemoveStopWords()),
+            ('lemmatize', LemmatizeText()),
+        ])
+    def fit(self, texts: pd.DataFrame, y=None):
         return self
-    # write logic to generate word embeddings, you can also change input arguments as per your requirement
-    def generate_word_embeddings(self, texts, vector_size=100, window=5, min_count=1, workers=4):
-        pass
-    # write logic to generate embedding for a single review

+    def generatetokens(self, texts: pd.DataFrame):
+        self.embedding_model.fit_on_texts(texts.tolist())
+        return
+
+    def generate_word_embeddings(self, texts: pd.DataFrame):
+        padedsequences = pad_sequences(self.embedding_model.texts_to_sequences(texts), maxlen=self.max_seq_length, padding='post', truncating='post')
+        return padedsequences
+
     def single_review_embedding(self, text):
-        pass
+        padedsequences = pad_sequences(self.embedding_model.texts_to_sequences(text), maxlen=self.max_seq_length, padding='post', truncating='post')
+        return padedsequences

-pipeline = Pipeline(steps=[
-    ('remove_punctuation', RemovePunctuation()),
-    ('remove_extra_spaces', RemoveExtraSpaces()),
-    ('remove_stop_words', RemoveStopWords()),
-    ('lemmatize', LemmatizeText()),
-])
+    def clean(self, texts:pd.DataFrame):
+        return self.pipeline.transform(texts)
+
+    def save(self):
+        joblib.dump(self, 'pre_pipeline.pkl')


-df = pd.read_csv('train.csv')
-reviews = df['reviews']
-# getting the cleaned reviews
-cleaned_reviews = pipeline.transform(reviews)
+def preprocess():
+    preprocessor = NLPPreprocessor()
+    df = pd.read_csv('train.csv')

-# Creating a new list for embeddings
-preprocessor = NLPPreprocessor()
-
-# Generate embeddings for all cleaned reviews at once
-embeddings = preprocessor.generate_word_embeddings(cleaned_reviews)  # Generate embeddings for the entire list
-
-# If needed, convert the embeddings to a list or DataFrame for easier handling
-embeddings_list = embeddings.tolist()  # Convert to list if embeddings are in a numpy array
-
-df['embeddings'] = embeddings_list
+    # save necessary things for further usage
+    df['review'] = preprocessor.clean(df['review'])
+
+    # fit the tokenizer only on the train reviews; it must not be refit on test data
+    preprocessor.generatetokens(df['review'])
+
+    df['embeddings'] = preprocessor.generate_word_embeddings(df['review']).tolist()
+
+    # print(f"vocab size = ", len(preprocessor.embedding_model.word_counts)+1)
+    # 66507 -> total vocab

-# Save the processed DataFrame
-pd.to_csv('processed_train.csv')
-# Saving the pipeline
-joblib.dump(pipeline, 'pre_pipeline.pkl')
+    df.to_csv("processed_train.csv", index=False)
+
+    preprocessor.save()

+# preprocess()
+# [Done] exited with code=0 in 59.544 seconds
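
For reference, the rewritten NLPPreprocessor is meant to be driven in the order clean -> generatetokens -> generate_word_embeddings, with the tokenizer fitted on training text only (as the comment in preprocess() warns). A minimal usage sketch under that assumption; the two toy reviews are hypothetical:

import pandas as pd
from pipe import NLPPreprocessor

# Toy reviews standing in for train.csv's review column.
train_reviews = pd.Series([
    "<br>I loved this movie, a truly great film!",
    "Terrible plot. I hated it.",
])

pre = NLPPreprocessor(vocabsize=70000, max_seq_length=350)
cleaned = pre.clean(train_reviews)         # lowercase, strip tags/punctuation/stop words, lemmatize
pre.generatetokens(cleaned)                # fit the Tokenizer on *train* text only
x = pre.generate_word_embeddings(cleaned)  # (2, 350) padded matrix of token indices
print(x.shape)

pre.save()  # writes pre_pipeline.pkl for model.py and predict.py to load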
Binary file added pre_pipeline.pkl
Binary file not shown.
Binary file added pre_pipeline_backup.pkl
Binary file not shown.
81 changes: 54 additions & 27 deletions predict.py
@@ -1,34 +1,61 @@
-import pandas as pd
-import joblib
 from keras.models import load_model
-from pipe import NLPPreprocessor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+from keras.models import Sequential
+from keras.layers import LSTM, Dense, Embedding, GlobalAveragePooling1D
+from keras.utils import to_categorical, pad_sequences
+from ast import literal_eval
+from pipe import *
+from model import *
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import joblib

-# Load the trained model, preprocessing pipeline, and embedding generator
-model_file = 'model.pkl'
-pipeline_file = 'pre_pipeline.pkl'
-
-# Load the model
-model = joblib.load(model_file)
-
-# Load the preprocessing pipeline
-preprocessor_pipeline = joblib.load(pipeline_file)
-
-# Get user input for the review
-review = input("Enter the review to predict sentiment: ")
-
-# Preprocess the review using the pipeline
-processed_review = preprocessor_pipeline.transform([review])
-
-# Generate embeddings using the embedding generator
-embeddings = NLPPreprocessor().single_review_embedding(processed_review)
-
-# Make the prediction
-prediction = model.predict(embeddings)
-predicted_class = prediction.argmax(axis=-1)  # Get the index of the class with the highest probability
-
-# Interpret the prediction (adjust based on your labeling)
-sentiment = "Positive" if predicted_class[0] == 1 else "Negative"
-
-print(f"The predicted sentiment is: {sentiment}")
+# Load the trained model, preprocessing pipeline, and embedding generator
+model_file = 'model.h5'
+preprocessor = joblib.load('pre_pipeline.pkl')

+# Load the model
+model = load_model(model_file)

+# Extra axis for sentiment relations: map the sigmoid probability to finer-grained labels
+def interpret_sentiment(prob):
+    if prob <= 0.20:
+        return "Strongly Negative"
+    elif prob <= 0.40:
+        return "Negative"
+    elif prob <= 0.49:
+        return "Neutral / Slightly Negative"
+    elif prob == 0.50:
+        return "Neutral / Ambiguous"
+    elif prob <= 0.60:
+        return "Neutral / Slightly Positive"
+    elif prob <= 0.80:
+        return "Positive"
+    else:
+        return "Strongly Positive"
+
+def predict_sentiment(review:str):
+    # Preprocess the review using the pipeline
+    a = {"a": [review]}
+    processed_review = preprocessor.clean(pd.DataFrame(a)['a'])
+    # Generate embeddings using the embedding generator
+    embeddings = preprocessor.single_review_embedding(processed_review)
+
+    # Make the prediction
+    prediction = model.predict(embeddings)
+    predicted_prob = prediction[0]
+    # Interpret the prediction (adjust based on your labeling)
+    sentiment = interpret_sentiment(predicted_prob)
+
+    print(f"The predicted sentiment is: {sentiment}")
+
+predict_sentiment("Harry potter is magical movie, just amazing, i liked whole plot.")
+
+# 1/1 [==============================] - ETA: 0s
+# 1/1 [==============================] - 1s 910ms/step
+# The predicted sentiment is: Strongly Positive
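
Because predict_sentiment reuses the saved pipeline and model, a batch variant only needs to hand clean() a whole Series instead of a one-element column. A sketch under the same file-name assumptions (model.h5, pre_pipeline.pkl), with a plain 0.5 cutoff standing in for interpret_sentiment's finer buckets:

import joblib
import pandas as pd
from keras.models import load_model

model = load_model('model.h5')
preprocessor = joblib.load('pre_pipeline.pkl')  # unpickling needs pipe.py importable

def predict_batch(reviews):
    cleaned = preprocessor.clean(pd.Series(reviews))             # same cleaning as training
    embeddings = preprocessor.generate_word_embeddings(cleaned)  # (n, 350) index matrix
    probs = model.predict(embeddings)[:, 0]                      # one sigmoid probability per review
    return ["Positive" if p > 0.5 else "Negative" for p in probs]

print(predict_batch(["Loved it!", "Waste of time."]))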