# -*- coding: utf-8 -*-
"""Untitled1.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1NXUBPcSsYZNbNP23Grv8nMZHq2zHD5L-
"""
# Install the Hugging Face `datasets` library first (shell / notebook command, not valid Python):
# pip install datasets
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datasets import load_dataset
# Label ids: sadness (0), joy (1), love (2), anger (3), fear (4), surprise (5)
# Replace 'emotion' with the name of the Hugging Face dataset you want to load
dataset_name = 'emotion'
# Load the emotion dataset
emotion_dataset = load_dataset(dataset_name)
# Access the train and test splits of the dataset
train_dataset = emotion_dataset['train']
test_dataset = emotion_dataset['test']
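# Optional sanity check on the loaded data (not part of the original script):
# print one training example and the label names. This assumes the standard
# `datasets` API (indexing a split, ClassLabel feature exposing `.names`).
print(train_dataset[0])                        # one record, e.g. {'text': ..., 'label': ...}
print(train_dataset.features['label'].names)   # human-readable names for label ids 0-5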
class NaiveBayes:
    def __init__(self):
        self.prior = None
        self.mean = None
        self.variance = None

    def fit(self, X, y, alpha=5.0):
        # Calculate smoothed log prior probabilities for each class
        num_samples, num_features = X.shape
        self.prior = {}
        classes = np.unique(y)
        for cls in classes:
            self.prior[cls] = np.log((np.sum(y == cls) + alpha) / (num_samples + alpha * len(classes)))
        # Calculate mean and variance of each feature, per class
        self.mean = {}
        self.variance = {}
        for cls in classes:
            self.mean[cls] = np.mean(X[y == cls], axis=0)
            self.variance[cls] = np.var(X[y == cls], axis=0)

    def gaussian_likelihood(self, x, mean, var):
        eps = 1e-9  # small value to avoid division by zero
        # Log of the Gaussian density: -0.5 * log(2*pi*var) - (x - mean)^2 / (2*var)
        return np.sum(
            -0.5 * np.log(2 * np.pi * (var + eps))
            - ((x - mean) ** 2) / (2 * (var + eps))
        )

    def predict(self, X):
        predictions = []
        classes = list(self.prior.keys())
        for sample in X:
            likelihoods = []
            for cls in classes:
                prior = self.prior[cls]
                mean = self.mean[cls]
                variance = self.variance[cls]
                # Log likelihood of the sample under this class's Gaussians
                likelihood = 0
                for i, feature in enumerate(sample):
                    likelihood += self.gaussian_likelihood(
                        feature, mean[i], variance[i]
                    )
                # Equivalent vectorized form:
                # likelihood = self.gaussian_likelihood(sample, mean, variance)
                likelihoods.append(prior + likelihood)
            # Predict the class with the highest posterior (log prior + log likelihood)
            predictions.append(classes[np.argmax(likelihoods)])
        return predictions
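# Minimal smoke test of the classifier on a tiny synthetic dataset
# (illustrative only; the feature values and labels below are made up
# and are not part of the emotion experiment that follows).
_toy_X = np.array([[1.0, 0.0], [1.2, 0.1], [0.0, 1.0], [0.1, 1.1]])
_toy_y = np.array([0, 0, 1, 1])
_toy_clf = NaiveBayes()
_toy_clf.fit(_toy_X, _toy_y, alpha=1.0)
print(_toy_clf.predict(np.array([[1.1, 0.0], [0.0, 1.2]])))  # should predict class 0, then class 1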
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words unigram counts; 'english' enables sklearn's built-in stop-word list
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

corpus = np.array(train_dataset['text'])
counts = vectorizer.fit_transform(corpus)
X_train = counts.toarray()  # densify the sparse count matrix (can be large in memory)
print(X_train.shape)
y_train = np.array(train_dataset['label'])

corpus = np.array(test_dataset['text'])
counts = vectorizer.transform(corpus)
X_test = counts.toarray()
y_test = np.array(test_dataset['label'])
# Create and fit the classifier for several smoothing values of alpha
alphas = [1, 3, 4, 5, 6, 7, 8]
for alpha in alphas:
    classifier = NaiveBayes()
    classifier.fit(X_train, y_train, alpha)
    # Make predictions on the test split
    predictions = classifier.predict(X_test)
    # print("Predictions:", predictions)
    print(f"alpha={alpha}: accuracy={np.mean(np.array(predictions) == y_test):.4f}")
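# Optional cross-check (not part of the original experiment): sklearn's
# GaussianNB fitted on the same bag-of-words features gives a reference
# accuracy to compare the hand-rolled implementation against.
from sklearn.naive_bayes import GaussianNB

sk_clf = GaussianNB()
sk_clf.fit(X_train, y_train)
print("sklearn GaussianNB accuracy:", sk_clf.score(X_test, y_test))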