-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaiveBayes.py
84 lines (65 loc) · 3.38 KB
/
naiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn.model_selection import train_test_split

# Load the labelled data set. This script reads two of its columns:
# "Words" (the text) and "Classification" (the target label).
data = pd.read_csv("classificate_model - Copy.csv")

# Split the rows into a training part and a held-out part (20% held out).
# The full frame is passed as X; the text column is selected afterwards.
# Stratifying on the label keeps the class proportions the same in both parts,
# and the fixed random_state makes the split reproducible.
labels = data["Classification"]
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the raw text in the "Words" column into TF-IDF feature matrices.
# The instance has its own name so that it does not shadow the
# TfidfVectorizer class it was created from.
# Alternative configuration kept for reference (word 2-grams and 3-grams):
#   tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(2, 3))
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training text only.
x_train = tfidf_vectorizer.fit_transform(x_train.Words)

# Re-use the fitted vocabulary for the held-out text. Calling fit_transform
# here would learn a second, unrelated vocabulary, so the test matrix columns
# would not correspond to the features the classifier is trained on.
x_test = tfidf_vectorizer.transform(x_test.Words)
##---------------------------------------------- Text Classification [Naive Bayes Classifier] ----------------------------------------------
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Candidate smoothing strengths to compare: 0.01, 0.1, 1, 10, 100.
alpha = [10**exponent for exponent in range(-2, 3)]
alpha_ranges = {"alpha": list(alpha)}

# Score every candidate alpha by accuracy. cv=3 requests 3-fold
# cross-validation on the training data; return_train_score=True also records
# the scores on the training folds so both curves can be compared.
classifer = MultinomialNB()
grid_search = GridSearchCV(
    estimator=classifer,
    param_grid=alpha_ranges,
    scoring="accuracy",
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Per-alpha mean and standard deviation of the fold scores.
cv_results = grid_search.cv_results_
train_accuracy = cv_results['mean_train_score']
train_std = cv_results['std_train_score']
test_accuracy = cv_results['mean_test_score']
test_std = cv_results['std_test_score']

# Disabled validation-curve plot, kept as a bare string literal so that it is
# never executed.
'''import matplotlib.pyplot as plt
#how to get validation curve https://www.geeksforgeeks.org/validation-curve/
# Plot mean accuracy scores for training and testing scores
plt.plot(alpha, train_accuracy,
label = "Training Score", color = 'b')
plt.plot(alpha, test_accuracy,
label = "Cross Validation Score", color = 'g')
# Creating the plot
plt.title("Validation Curve with Naive Bayes Classifier")
plt.xlabel("Alpha_range")
plt.ylabel("Accuracy")
plt.tight_layout()
plt.legend(loc = 'best')
plt.show()'''
##---------------------------------------------- [Naive Bayes Classifier] prediction ----------------------------------------------
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Encode the string labels as integers with ONE encoder fitted on the training
# labels. Fitting a second encoder on the test labels could assign different
# integers to the same class whenever the two splits do not contain exactly
# the same set of labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
print("y_train", y_train)
# transform() raises ValueError if the test split contains a label that was
# never seen in training, instead of silently mis-mapping it.
y_test = label_encoder.transform(y_test)
print("y_test", y_test)

# Train the final model with the alpha chosen by the grid search rather than a
# constant copied by hand from an earlier run.
mNB = MultinomialNB(alpha=grid_search.best_params_["alpha"])
predict = mNB.fit(x_train, y_train).predict(x_test)

# Held-out accuracy report, left disabled as in the original. The name is
# corrected to y_test; the original referenced the undefined name valid_y.
#print("accuracy is : ", accuracy_score(y_test, predict))