-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHotel_Rating_Prediction.py
476 lines (383 loc) · 16.7 KB
/
Hotel_Rating_Prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
# -*- coding: utf-8 -*-
"""hotel-rating-prediction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/137HuTQ6MoX4iJIozu-WwEcCVbO_uikag
"""
# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# data visualization
import matplotlib.pylab as plt
# %matplotlib inline
from matplotlib.pylab import rcParams
import seaborn as sns
# text processing library
import spacy
import re
from gensim import corpora, models, similarities
# model library
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
import scipy as scipy
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVR
#classification
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_numeric
from gensim.parsing.preprocessing import strip_multiple_whitespaces, strip_non_alphanum, remove_stopwords, strip_short
import simplejson as json
from nltk.stem import PorterStemmer
porter = PorterStemmer()
from nltk.corpus import stopwords
from collections import Counter
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from scipy.sparse import csr_matrix
import re
import reverse_geocoder as rg
from geopy.geocoders import Nominatim
from opencage.geocoder import OpenCageGeocode
# Load the raw hotel reviews and discard every row that has a missing value.
data = pd.read_csv("Hotel_Reviews.csv").dropna()
data.head()
# List the available column names, one per line.
for column_name in data.columns:
    print(column_name)
def createCityState(df):
    """Prepend reverse-geocoded place columns to ``df``.

    Every (lat, lng) pair of ``df`` is looked up with ``rg.search`` and the
    ``name``, ``admin1`` and ``admin2`` fields of each result become the
    ``city_1``, ``state`` and ``city_2`` columns.

    Args:
        df: DataFrame with ``lat`` and ``lng`` columns.

    Returns:
        A new DataFrame holding the three place columns followed by the
        columns of ``df``; rows containing any missing value are dropped.
    """
    coordinates = list(zip(df.lat, df.lng))
    # Assumes rg.search returns one result per coordinate, in input order
    # (the positional pairing below relies on it) -- TODO confirm.
    results = rg.search(coordinates) if coordinates else []
    rows = [[match['name'], match['admin1'], match['admin2']] for match in results]
    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would pair results with the wrong rows (and create NaN rows
    # that dropna() then removes) whenever df's index has gaps, e.g. after a
    # previous dropna().
    city_df = pd.DataFrame(rows, columns=['city_1', 'state', 'city_2'], index=df.index)
    new_df = pd.concat([city_df, df], axis=1)
    return new_df.dropna()
import os

# Reverse-geocoding client used by get_address().
geolocator = Nominatim(user_agent="pattern")
# SECURITY: an API key committed to source control should be treated as leaked
# and rotated. Prefer supplying it through the OPENCAGE_API_KEY environment
# variable; the literal is kept only as a fallback so existing runs keep working.
key = os.environ.get("OPENCAGE_API_KEY", "72da58f1121e40cbb545966d98af58c6")
# Forward-geocoding client used by get_address() to fill missing coordinates.
geocoder = OpenCageGeocode(key)
def get_address(df):
    """Fill in missing coordinates and add ``city``/``country``/``zip`` columns.

    Hotels whose ``lat`` or ``lng`` is null are forward-geocoded from their
    ``Hotel_Address`` with the module-level ``geocoder``. Afterwards each
    distinct address is reverse-geocoded once with the module-level
    ``geolocator`` and the result is copied to all rows of that hotel.

    Args:
        df: DataFrame with ``Hotel_Address``, ``lat`` and ``lng`` columns.
            It is modified in place.

    Returns:
        The same DataFrame. ``city``, ``country`` and ``zip`` stay ``""`` for
        hotels that could not be resolved.
    """
    # Only a missing coordinate warrants forward geocoding; checking every
    # column would overwrite valid coordinates when an unrelated field is null.
    missing_coords = df[['lat', 'lng']].isnull().any(axis=1)
    for address in df.loc[missing_coords, 'Hotel_Address'].unique():
        results = geocoder.geocode(str(address))
        if not results:
            # Nothing found for this address: leave the coordinates missing.
            continue
        geometry = results[0]['geometry']
        same_hotel = df.Hotel_Address == address
        df.loc[same_hotel, 'lat'] = geometry['lat']
        df.loc[same_hotel, 'lng'] = geometry['lng']

    df["city"] = ""
    df["country"] = ""
    df["zip"] = ""
    # One reverse lookup per distinct hotel address, not one per review.
    for address in df.Hotel_Address.unique():
        same_hotel = df.Hotel_Address == address
        first_row = df.loc[same_hotel].iloc[0]
        if pd.isnull(first_row.lat) or pd.isnull(first_row.lng):
            continue
        location = geolocator.reverse(str(first_row.lat) + ", " + str(first_row.lng))
        if location is None:
            continue
        details = location.raw.get("address", {})
        # Not every result carries a "city" key; smaller places are presumably
        # reported as "town" or "village" -- verify against the geocoder docs.
        df.loc[same_hotel, 'city'] = (
            details.get("city") or details.get("town") or details.get("village") or ""
        )
        df.loc[same_hotel, 'country'] = details.get("country", "")
        df.loc[same_hotel, 'zip'] = details.get("postcode", "")
    return df
def split_train_to_test_validate(labeled_data, test_size=0.17, validate_size=0.17,
                                 random_state=None):
    """Split labelled data into train/validate/test sets and write them as CSV.

    The test set is carved out of ``labeled_data`` first; the validation set is
    then carved out of the remaining rows. Files written to the working
    directory:

    * ``test_label.csv`` / ``validate_label.csv`` -- the ``rating`` and
      ``Reviewer_Score`` columns of the respective split,
    * ``test_data.csv`` / ``validate_data.csv`` -- the same rows without those
      two columns,
    * ``train_data.csv`` -- the remaining training rows, labels included.

    Args:
        labeled_data: DataFrame containing ``rating`` and ``Reviewer_Score``.
        test_size: Fraction of ``labeled_data`` held out as the test set.
        validate_size: Fraction of the non-test rows held out for validation.
        random_state: Seed forwarded to ``train_test_split``; ``None`` keeps
            the split random on each call.
    """
    train_part, test_part = train_test_split(
        labeled_data, test_size=test_size, random_state=random_state)
    _write_split_files(test_part, 'test_label.csv', 'test_data.csv')

    train_final, validate_part = train_test_split(
        train_part, test_size=validate_size, random_state=random_state)
    _write_split_files(validate_part, 'validate_label.csv', 'validate_data.csv')

    train_final.to_csv('train_data.csv')


def _write_split_files(frame, label_path, data_path):
    """Write the label columns of ``frame`` to one CSV and the rest to another."""
    label_columns = ['rating', 'Reviewer_Score']
    frame[label_columns].to_csv(label_path)
    frame.drop(label_columns, axis=1).to_csv(data_path)
# Attach the reverse-geocoded city_1/state/city_2 columns to the raw data.
data = createCityState(data)
# Work on a copy of the selected columns only.
review_data = data[['Hotel_Name', 'Positive_Review', 'Negative_Review', 'Average_Score', 'Reviewer_Score','Reviewer_Nationality','Tags','lat','lng','city_1','state']].copy()
review_data.head()
# Combine the positive and the negative review into one text, separated by a space.
review_data['reviews'] = review_data[['Positive_Review', 'Negative_Review']].apply(lambda x: ' '.join(x), axis = 1)
# Plot how often each distinct reviewer score occurs.
review_data.Reviewer_Score.value_counts().plot(kind='bar', title='Count of Reviews', figsize = (15, 4), alpha = 0.8)
# Round the review score UP to the next integer (np.ceil), not to the nearest one.
review_data['round_review_score'] = review_data.Reviewer_Score.apply(lambda x: np.ceil(x))
# Keep a fixed (random_state=45) 10% sample, drawn without replacement, to speed up the computation.
reviews_df = review_data.sample(frac = 0.1, replace = False, random_state=45)
reviews_df.head()
# spaCy pipeline used by text_preprocess() for lemmatization.
# NOTE(review): the 'en' shortcut name is presumably only resolvable on older
# spaCy releases -- confirm against the installed version.
nlp = spacy.load('en')
def f1(x):
    """Coerce the input to ``str``."""
    return str(x)


def f2(x):
    """Lower-case the text."""
    return x.lower()


def f3(x):
    """Remove fragments that contain a backslash.

    A match starts at the non-space character right before a run of
    backslashes and extends to the end of that whitespace-delimited token.

    The previous pattern ``'[^\\s][\\]+[^\\s]*'`` had lost the escaped
    backslash, so ``[\\]+[^\\s]`` was parsed as one character class and the
    substitution deleted every non-space character of the text.
    """
    return re.sub(r'[^\s][\\]+[^\s]*', "", x)


def f4(x):
    """Remove parenthesized spans, including the parentheses."""
    return re.sub(r'\([^)]*\)', '', x)


def text_regexes(x):
    """Apply f1 -> f2 -> f3 -> f4 to ``x`` and return the cleaned string."""
    return f4(f3(f2(f1(x))))
# Ordered filter list passed to gensim's preprocess_string() by text_preprocess():
# the regex clean-up defined in this file first, followed by gensim's own
# filters for tags, punctuation, non-alphanumerics, digits, stop words and
# repeated whitespace.
text_process_operators_list = [
text_regexes,
strip_tags,
strip_punctuation,
strip_non_alphanum,
strip_numeric,
remove_stopwords,
strip_multiple_whitespaces
]
def text_preprocess(docs, logging=True):
    """Clean and lemmatize a collection of raw texts.

    Each text is run through ``preprocess_string`` with
    ``text_process_operators_list``, re-joined with single spaces and parsed by
    the module-level ``nlp`` pipeline (with 'ner', 'tagger' and 'textcat'
    disabled). Lemmas equal to '-PRON-' are left out.

    Args:
        docs: Iterable of raw texts.
        logging: Accepted for compatibility; not used by the function.

    Returns:
        A ``pd.Series`` holding one list of lemmas per input text.
    """
    skipped_pipes = ['ner', 'tagger', 'textcat']
    lemmatized = []
    for text in docs:
        tokens = preprocess_string(text, text_process_operators_list)
        parsed = nlp(" ".join(tokens), disable=skipped_pipes)
        lemmatized.append([tok.lemma_ for tok in parsed if tok.lemma_ != '-PRON-'])
    return pd.Series(lemmatized)
# Quick look at the preprocessing output for a handful of reviews.
text_preprocess(reviews_df.reviews.iloc[10:15])
# The notebook ran this assignment under the "%time" cell magic, which the
# export commented out; without it train_corpus is undefined below.
train_corpus = text_preprocess(reviews_df.reviews)
# create ngrams
ngram_phraser = models.Phrases(train_corpus, threshold=1)
# Was "ngram_phraser_1", a name that is never defined in this file.
ngram = models.phrases.Phraser(ngram_phraser)
# apply the n-gram model to each tokenized review and re-join it into one string
texts_1 = [' '.join(ngram[tokens]) for tokens in train_corpus]
# adding it to dataframe
reviews_df['ngram'] = texts_1
reviews_df.head()
def createLabelsFromReviewPoints(df):
    """Add a ``class`` column holding the label_reviews() result of every row.

    Args:
        df: DataFrame with a ``Reviewer_Score`` column; modified in place.

    Returns:
        The same DataFrame.
    """
    df['class'] = df.apply(label_reviews, axis=1)
    return df
def label_reviews(row):
    """Map a row's ``Reviewer_Score`` to a class label from 1 to 5.

    Buckets: [0, 5] -> 1, (5, 7] -> 2, (7, 8] -> 3, (8, 9] -> 4, above 9 -> 5.
    Any other value (negative or NaN) yields 0.
    """
    score = row['Reviewer_Score']
    if 0 <= score <= 5:
        return 1
    if 5 < score <= 7:
        return 2
    if 7 < score <= 8:
        return 3
    if 8 < score <= 9:
        return 4
    return 5 if score > 9 else 0
reviews_df = createLabelsFromReviewPoints(reviews_df)
# Dividing data by class
# value_counts() lists counts from most to least frequent, so class_count[0]
# is the size of the largest class.
class_count = reviews_df['class'].value_counts().values
df_class_5 = reviews_df[reviews_df['class'] == 5.0]
df_class_4 = reviews_df[reviews_df['class'] == 4.0]
df_class_3 = reviews_df[reviews_df['class'] == 3.0]
df_class_2 = reviews_df[reviews_df['class'] == 2.0]
df_class_1 = reviews_df[reviews_df['class'] == 1.0]
# Resample every class (with replacement) to the size of the largest class.
df_class_5 = df_class_5.sample(class_count[0], replace=True)
df_class_4 = df_class_4.sample(class_count[0], replace=True)
df_class_3 = df_class_3.sample(class_count[0], replace=True)
df_class_2 = df_class_2.sample(class_count[0], replace=True)
df_class_1 = df_class_1.sample(class_count[0], replace=True)
# concatenate individual dataframes
df_train = pd.concat([df_class_5, df_class_4, df_class_3, df_class_2, df_class_1], axis=0)
# NOTE(review): the resampling and the vocabulary fit below both happen before
# the train/test split, so duplicated reviews can land on both sides of the
# split and test scores may be optimistic -- consider splitting first.
# represent features in countvectorizer for ngram
vectorizer = CountVectorizer()
# Was fitted on "df_train_oversampled", a name that is never defined; the
# balanced frame built above is df_train.
vectorizer.fit(df_train.ngram)
# split into test and train sets
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(df_train, df_train['class'], test_size=0.4)
def onehot(train, test, col_name):
    """One-hot encode ``col_name`` of the train and test frames.

    The encoder is fitted on ``train`` only; categories that occur only in
    ``test`` are encoded as all zeros (``handle_unknown='ignore'``). The first
    encoded column is dropped from both results.

    Args:
        train: DataFrame used to fit the encoder.
        test: DataFrame encoded with the fitted encoder.
        col_name: Name of the categorical column to encode.

    Returns:
        Tuple ``(train_enc_df, test_enc_df)`` of dense DataFrames whose columns
        are named after the encoder's feature names.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    # Use the arguments; the earlier version ignored them and always read the
    # module-level X_train_1 / X_test_1.
    encoder.fit(train[[col_name]])

    # get_feature_names() is gone from newer scikit-learn releases in favour
    # of get_feature_names_out(); support both.
    if hasattr(encoder, 'get_feature_names_out'):
        feature_names = encoder.get_feature_names_out([col_name])
    else:
        feature_names = encoder.get_feature_names([col_name])

    def _encode(frame):
        encoded = pd.DataFrame(encoder.transform(frame[[col_name]]).toarray())
        encoded.columns = feature_names
        return encoded.drop([encoded.columns[0]], axis=1)

    return _encode(train), _encode(test)
def Build_matrix_idf_normalize(df, voc):
    """Build the sparse tag matrix for ``df`` and print its dimensions.

    Despite the name, no IDF weighting or L2 normalization is applied: the
    matrix returned by build_matrix() is handed back unchanged.

    Args:
        df: DataFrame with a ``Tags`` column.
        voc: Vocabulary mapping passed through to build_matrix().

    Returns:
        The matrix produced by build_matrix().
    """
    tag_matrix = build_matrix(df, voc)
    csr_info(tag_matrix)
    return tag_matrix
def build_matrix(df, idx):
    """Build a sparse term-count matrix from the ``Tags`` column of ``df``.

    Each row's ``Tags`` string is parsed as a JSON list of tag phrases (single
    quotes are first converted to double quotes). Every word of every phrase is
    stemmed with the module-level ``porter`` stemmer and counted if the stem is
    a key of ``idx`` and longer than two characters.

    Args:
        df: DataFrame with a ``Tags`` column of list-like strings.
        idx: Mapping from stem to column index.

    Returns:
        A ``csr_matrix`` of shape ``(len(df), len(idx))`` with dtype double,
        holding per-row stem counts, with sorted column indices.
    """
    nrows = df.shape[0]
    ncols = len(idx)

    # Grow plain lists in a single pass instead of pre-sizing arrays: the old
    # pre-count extended a list with a *string* (adding its characters), so the
    # buffers were sized by distinct characters rather than distinct stems.
    ind = []
    val = []
    ptr = [0]
    for _, row in df.iterrows():
        raw = row['Tags'].strip("'<>() ").replace('\'', '\"')
        stems = []
        for tags in json.loads(raw):
            for tag in tags.strip().split():
                stem = porter.stem(tag)
                if stem in idx and len(stem) > 2:
                    stems.append(stem)
        for stem, count in Counter(stems).most_common():
            ind.append(idx[stem])
            val.append(count)
        ptr.append(len(ind))

    # np.int was removed from NumPy; use an explicit integer dtype.
    mat = csr_matrix(
        (np.asarray(val, dtype=np.double),
         np.asarray(ind, dtype=np.int64),
         np.asarray(ptr, dtype=np.int64)),
        shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    return mat
def csr_info(mat, name="", non_empy=False):
    """Print a one-line summary of a CSR matrix.

    Args:
        mat: Matrix exposing ``shape``, ``indptr``, ``indices`` and ``data``.
        name: Label printed in front of the summary.
        non_empy: When true, also report how many rows and columns hold at
            least one stored entry.
    """
    nrows, ncols = mat.shape[0], mat.shape[1]
    nnz = len(mat.data)
    if not non_empy:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, nrows, ncols, nnz) )
        return
    # A row is non-empty when its slice of indptr has positive length.
    filled_rows = sum(
        1 for start, stop in zip(mat.indptr[:-1], mat.indptr[1:]) if stop > start)
    filled_cols = len(np.unique(mat.indices))
    print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
        name, nrows, filled_rows, ncols, filled_cols, nnz))
def convertTagsToFeatures(df):
    """Build a stem -> feature-id vocabulary from the ``Tags`` column of ``df``.

    Each row's ``Tags`` string is parsed as a JSON list of tag phrases (single
    quotes are first converted to double quotes, standalone numbers are
    removed). Every word is stemmed with the module-level ``porter`` stemmer;
    stems found in ``stop_words`` or no longer than two characters are skipped.

    Args:
        df: DataFrame with a ``Tags`` column of list-like strings.

    Returns:
        Dict mapping each kept stem to a consecutive integer id, numbered from
        0 in order of first appearance.
    """
    features = {}
    for _, row in df.iterrows():
        raw = row['Tags'].strip("'<>() ").replace('\'', '\"')
        # The stripped text used to be stored under a misspelled name
        # ("rowValuve") and was never used, so numbers were not removed.
        raw = re.sub(r"\b\d+\b", "", raw)
        for tags in json.loads(raw):
            for tag in tags.strip().split():
                stem = porter.stem(tag)
                if stem in stop_words or len(stem) <= 2:
                    continue
                # len(features) is the next free id; existing stems keep theirs.
                features.setdefault(stem, len(features))
    return features
# Features from tags: the vocabulary is built from the training rows only and
# reused to build the test matrix.
# NOTE(review): tags_csr / tags_csr_test and the city/state encodings below are
# not part of the X_train_csr / X_test_csr stacking later in this file --
# confirm whether that is intended.
tagFeatures = convertTagsToFeatures(X_train_1)
tags_csr = Build_matrix_idf_normalize(X_train_1,tagFeatures)
tags_csr_test = Build_matrix_idf_normalize(X_test_1,tagFeatures)
# One-hot encode the categorical columns (train frame first, test frame second).
train_nation_df,test_nation_df = onehot(X_train_1,X_test_1,'Reviewer_Nationality')
train_hotel_df,test_hotel_df = onehot(X_train_1,X_test_1,'Hotel_Name')
train_city_df,test_city_df = onehot(X_train_1,X_test_1,'city_1')
train_state_df,test_state_df = onehot(X_train_1,X_test_1,'state')
def convertToCSR(df):
    """Return the values of ``df`` as a SciPy CSR sparse matrix."""
    dense_values = df.values
    return csr_matrix(dense_values)
# Convert every one-hot DataFrame to a sparse CSR matrix so it can be stacked
# with the CountVectorizer output.
train_nation_csr = convertToCSR(train_nation_df)
test_nation_csr = convertToCSR(test_nation_df)
train_hotel_csr = convertToCSR(train_hotel_df)
test_hotel_csr = convertToCSR(test_hotel_df)
train_city_csr = convertToCSR(train_city_df)
test_city_csr = convertToCSR(test_city_df)
train_state_csr = convertToCSR(train_state_df)
test_state_csr = convertToCSR(test_state_df)
def concatVectorizer(m1, m2):
    """Stack two sparse matrices side by side (``m1`` columns first)."""
    blocks = [m1, m2]
    return hstack(blocks)
# Concat all CSR train data: nationality one-hot | hotel one-hot | n-gram counts.
# The fitted CountVectorizer is named "vectorizer"; "vectorizer_1" is never defined.
X_train_csr = concatVectorizer(concatVectorizer(train_nation_csr, train_hotel_csr), vectorizer.transform(X_train_1.ngram))
# Concat all CSR test data in the same column order.
X_test_csr = concatVectorizer(concatVectorizer(test_nation_csr, test_hotel_csr), vectorizer.transform(X_test_1.ngram))
"""# **PCA**"""
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
# scaler = StandardScaler(with_mean=False)
# # Fit on training set only.
# scaler.fit(X_train_csr)
# # Apply transform to both the training set and the test set.
# X_train_csr = scaler.transform(X_train_csr)
# X_test_csr = scaler.transform(X_test_csr)
# pca = TruncatedSVD(0.96)
# X_train_csr = pca.fit_transform(X_train_csr)
# X_test_csr = pca.transform(X_test_csr)
def predict_random_forest():
    """Train a random forest on the module-level features and print its scores.

    Fits on ``X_train_csr`` / ``y_train_1``, predicts ``X_test_csr`` and prints
    the classification report and the RMSE against ``y_test_1``.
    """
    # Imported here because the file never imports mean_squared_error (nor
    # sqrt), which made the RMSE line raise NameError.
    from sklearn.metrics import mean_squared_error

    model = RandomForestClassifier(n_estimators=150, bootstrap=True, max_features='sqrt', n_jobs=-1)
    model.fit(X_train_csr, y_train_1)
    y_pred_random = model.predict(X_test_csr)
    print(classification_report(y_test_1, y_pred_random))
    rmse = float(np.sqrt(mean_squared_error(y_test_1, y_pred_random)))
    print("RMSE for Random Forest Classifier", rmse)
def predict_Neural():
    """Train an MLP classifier on the module-level features and print its scores.

    Fits on ``X_train_csr`` / ``y_train_1``, predicts ``X_test_csr`` and prints
    the classification report and the RMSE against ``y_test_1``.
    """
    # Imported here because the file never imports mean_squared_error (nor
    # sqrt), which made the RMSE line raise NameError.
    from sklearn.metrics import mean_squared_error

    clf = MLPClassifier(hidden_layer_sizes=(50, 100), max_iter=200, activation='logistic',
                        solver='adam', learning_rate_init=0.0001, learning_rate='adaptive')
    clf.fit(X_train_csr, y_train_1)
    y_pred_neural = clf.predict(X_test_csr)
    print(classification_report(y_test_1, y_pred_neural))
    rmse = float(np.sqrt(mean_squared_error(y_test_1, y_pred_neural)))
    print("RMSE for Neural MLP Classifier", rmse)
def predict_GaussianNB():
    """Train Gaussian naive Bayes on the module-level features and print its scores.

    Fits on the densified ``X_train_csr`` / ``y_train_1``, predicts the
    densified ``X_test_csr`` and prints the classification report and the RMSE
    against ``y_test_1``.
    """
    # Imported here because the file never imports mean_squared_error (nor
    # sqrt), which made the RMSE line raise NameError.
    from sklearn.metrics import mean_squared_error

    gnb = GaussianNB()
    # GaussianNB needs dense input. toarray() yields an ndarray, whereas
    # todense() yields np.matrix, which recent scikit-learn releases reject.
    gnb.fit(X_train_csr.toarray(), y_train_1)
    y_pred_GB = gnb.predict(X_test_csr.toarray())
    print(classification_report(y_test_1, y_pred_GB))
    rmse = float(np.sqrt(mean_squared_error(y_test_1, y_pred_GB)))
    print("RMSE for Gaussian Naive Bayes Classifier", rmse)
def predict_SVM():
    """Train a linear SVR on the module-level features and print its scores.

    The model is a regressor: its raw predictions are printed, then rounded to
    the nearest integer before the classification report and the RMSE against
    ``y_test_1`` are computed.
    """
    # Imported here because the file never imports mean_squared_error (nor
    # sqrt), which made the RMSE line raise NameError.
    from sklearn.metrics import mean_squared_error

    svclassifier = LinearSVR(random_state=50, max_iter=100000, epsilon=0, tol=1e-9)
    # toarray() yields an ndarray; todense() yields np.matrix, which recent
    # scikit-learn releases reject.
    svclassifier.fit(X_train_csr.toarray(), y_train_1)
    scv_test_predict = svclassifier.predict(X_test_csr.toarray())
    print(scv_test_predict)
    rounded_predictions = np.rint(scv_test_predict)
    print(classification_report(y_test_1, rounded_predictions))
    rmse = float(np.sqrt(mean_squared_error(y_test_1, rounded_predictions)))
    print("RMSE for Linear SVR", rmse)
# Train and evaluate every model in turn; each one prints its own report.
for run_model in (predict_random_forest, predict_Neural, predict_GaussianNB, predict_SVM):
    run_model()