-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassifier_model.py
69 lines (47 loc) · 2.12 KB
/
classifier_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 14 23:29:56 2020
@author: debanjalibiswas
Implementation of a classification model to distinguish between similar Languages
Classifier model: soft voting classifier on the ensemble of SVM and Naive Bayes classifiers
using n-gram (2-6) character level Tfidf feature extractor
"""
from utils import lang
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
def extract_features(max_features = 8000):
    """
    Build the Tfidf feature extractor.

    max_features: maximum vocabulary size (number of n-gram features kept)

    Returns an unfitted TfidfVectorizer over character-level n-grams (2-6).
    """
    # character-level n-grams of length 2 through 6, capped at max_features
    vectorizer = TfidfVectorizer(
        analyzer='char',
        ngram_range=(2, 6),
        max_features=max_features,
    )

    return vectorizer
def train(X_train, y_train):
    """
    Train the soft-voting ensemble classifier on the training set.

    X_train: training feature matrix (e.g. Tfidf features)
    y_train: training labels

    Returns the fitted VotingClassifier.
    """
    classifier_NB = MultinomialNB()  # Naive Bayes classifier

    # Linear model trained with Stochastic Gradient Descent.
    # FIX: loss='log' was deprecated in scikit-learn 1.1 and removed in 1.3;
    # the current name is 'log_loss'. A probabilistic loss is required here
    # because soft voting needs predict_proba, which a true hinge-loss SVM
    # does not provide — so this estimator is logistic regression, despite
    # the 'svm' label kept below for backward compatibility.
    classifier_SVM = SGDClassifier(loss='log_loss')

    # soft voting averages the predicted class probabilities of both models
    classifier = VotingClassifier(
        estimators=[('nb', classifier_NB), ('svm', classifier_SVM)],
        voting='soft',
    )

    # fit the ensemble on the training data
    classifier.fit(X_train, y_train)

    return classifier
def predict(classifier, X_test, y_test):
    """
    Evaluate a trained classifier on the test set.

    classifier: fitted classifier
    X_test: test set features
    y_test: true test set labels

    Returns (accuracy, confusion matrix, weighted f1 score).
    """
    # predict labels for the test set with the trained model
    predictions = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    # confusion matrix rows/columns ordered by the `lang` label list
    confusion = confusion_matrix(y_test, predictions, labels=lang)
    f1 = f1_score(y_test, predictions, average='weighted')

    return accuracy, confusion, f1