# -*- coding: utf-8 -*-
"""Untitled1.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1NXUBPcSsYZNbNP23Grv8nMZHq2zHD5L-
"""
# Install the Hugging Face `datasets` library first (shell / notebook command, not valid Python):
# pip install datasets
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datasets import load_dataset
# Label ids: sadness (0), joy (1), love (2), anger (3), fear (4), surprise (5)
# Replace 'emotion' with the name of the Hugging Face dataset you want to load
dataset_name = 'emotion'
# Load the emotion dataset
emotion_dataset = load_dataset(dataset_name)
# Access the train and test splits of the dataset
train_dataset = emotion_dataset['train']
test_dataset = emotion_dataset['test']
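# Optional sanity check on the loaded data (not part of the original script):
# print one training example and the label names. This assumes the standard
# `datasets` API (indexing a split, ClassLabel feature exposing `.names`).
print(train_dataset[0])                        # one record, e.g. {'text': ..., 'label': ...}
print(train_dataset.features['label'].names)   # human-readable names for label ids 0-5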
class NaiveBayes:
    def __init__(self):
        self.prior = None
        self.mean = None
        self.variance = None

    def fit(self, X, y, alpha=5.0):
        # Calculate smoothed log prior probabilities for each class
        num_samples, num_features = X.shape
        self.prior = {}
        classes = np.unique(y)
        for cls in classes:
            self.prior[cls] = np.log((np.sum(y == cls) + alpha) / (num_samples + alpha * len(classes)))
        # Calculate mean and variance of each feature, per class
        self.mean = {}
        self.variance = {}
        for cls in classes:
            self.mean[cls] = np.mean(X[y == cls], axis=0)
            self.variance[cls] = np.var(X[y == cls], axis=0)

    def gaussian_likelihood(self, x, mean, var):
        eps = 1e-9  # small value to avoid division by zero
        # Log of the Gaussian density: -0.5 * log(2*pi*var) - (x - mean)^2 / (2*var)
        return np.sum(
            -0.5 * np.log(2 * np.pi * (var + eps))
            - ((x - mean) ** 2) / (2 * (var + eps))
        )

    def predict(self, X):
        predictions = []
        classes = list(self.prior.keys())
        for sample in X:
            likelihoods = []
            for cls in classes:
                prior = self.prior[cls]
                mean = self.mean[cls]
                variance = self.variance[cls]
                # Log likelihood of the sample under this class's Gaussians
                likelihood = 0
                for i, feature in enumerate(sample):
                    likelihood += self.gaussian_likelihood(
                        feature, mean[i], variance[i]
                    )
                # Equivalent vectorized form:
                # likelihood = self.gaussian_likelihood(sample, mean, variance)
                likelihoods.append(prior + likelihood)
            # Predict the class with the highest posterior (log prior + log likelihood)
            predictions.append(classes[np.argmax(likelihoods)])
        return predictions
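# Minimal smoke test of the classifier on a tiny synthetic dataset
# (illustrative only; the feature values and labels below are made up
# and are not part of the emotion experiment that follows).
_toy_X = np.array([[1.0, 0.0], [1.2, 0.1], [0.0, 1.0], [0.1, 1.1]])
_toy_y = np.array([0, 0, 1, 1])
_toy_clf = NaiveBayes()
_toy_clf.fit(_toy_X, _toy_y, alpha=1.0)
print(_toy_clf.predict(np.array([[1.1, 0.0], [0.0, 1.2]])))  # should predict class 0, then class 1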
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words unigram counts; 'english' enables sklearn's built-in stop-word list
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

corpus = np.array(train_dataset['text'])
counts = vectorizer.fit_transform(corpus)
X_train = counts.toarray()  # densify the sparse count matrix (can be large in memory)
print(X_train.shape)
y_train = np.array(train_dataset['label'])

corpus = np.array(test_dataset['text'])
counts = vectorizer.transform(corpus)
X_test = counts.toarray()
y_test = np.array(test_dataset['label'])
# Create and fit the classifier for several smoothing values of alpha
alphas = [1, 3, 4, 5, 6, 7, 8]
for alpha in alphas:
    classifier = NaiveBayes()
    classifier.fit(X_train, y_train, alpha)
    # Make predictions on the test split
    predictions = classifier.predict(X_test)
    # print("Predictions:", predictions)
    print(f"alpha={alpha}: accuracy={np.mean(np.array(predictions) == y_test):.4f}")
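# Optional cross-check (not part of the original experiment): sklearn's
# GaussianNB fitted on the same bag-of-words features gives a reference
# accuracy to compare the hand-rolled implementation against.
from sklearn.naive_bayes import GaussianNB

sk_clf = GaussianNB()
sk_clf.fit(X_train, y_train)
print("sklearn GaussianNB accuracy:", sk_clf.score(X_test, y_test))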