-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassify_comments.py
42 lines (31 loc) · 1.28 KB
/
classify_comments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
import pickle as pkl
# Train a Gradient Boosting classifier on the NLTK NPS Chat corpus and
# persist both the model and its TF-IDF vectorizer to disk with pickle.

# Download the data to train the model on
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()

# Extract the text and the label (the 'class' attribute) of every post
posts_text = [post.text for post in posts]
y = [post.get('class') for post in posts]

# Split into train and test 80/20. A single split index is shared by the
# texts and the labels so the two stay aligned, and the test set is the
# held-out last 20% (it must not overlap the training data).
split_index = int(len(posts_text) * 0.8)
train_text = posts_text[:split_index]
test_text = posts_text[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

# Get TF-IDF features over word uni-, bi- and tri-grams. The vectorizer is
# fitted on the training texts only and then applied to the test texts.
vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             min_df=0.001,
                             max_df=0.7,
                             analyzer='word')
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# Fit the Gradient Boosting classifier to the training set
gb = GradientBoostingClassifier(n_estimators=400, random_state=0)
# Can be improved with cross-validation
gb.fit(X_train, y_train)

# Save the trained model and the fitted vectorizer to avoid retraining at
# runtime. Both files must be opened in binary *write* mode for pickle.dump,
# and the context managers make sure the data is flushed and the handles
# are closed.
filename_model = "question_classifier"
filename_vectorizer = "text_vectorizer"
with open(filename_model, 'wb') as model_file:
    pkl.dump(gb, model_file)
with open(filename_vectorizer, 'wb') as vectorizer_file:
    pkl.dump(vectorizer, vectorizer_file)