# 2_wlg_vectorised.py
# Cleans the ad descriptions read from 2_wlg_test2.csv and exports sentence,
# n-gram count and TF-IDF CSV files.
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import nltk.corpus as Corpus
import numpy as np
# ---------------------------------------------------------------------------
# Step 1: split the raw ad descriptions into sentences and export them.
# ---------------------------------------------------------------------------
filename = "2_wlg_test2.csv"  # spelling was checked in Excel beforehand

# Read the whole file as one string. The context manager closes the handle
# as soon as the text is in memory, even if reading raises.
with open(filename, "r", encoding="utf-8") as f:
    ad_descriptions = f.read()
#ad_descriptions = re.sub(r'[\*\;]', '\n', ad_descriptions) #don't need this for chch

# A tokenised sentence can still contain line breaks, so every sentence is
# split on '\n' again and each fragment is stripped of surrounding whitespace.
sent_text = sent_tokenize(ad_descriptions)
sentences_arrays = [sentence.split('\n') for sentence in sent_text]
sentences = [
    fragment.strip()
    for sub_list in sentences_arrays
    for fragment in sub_list
]

# Frequency of whole sentences (not single words): the 200 most repeated
# sentences are exported to help spot agency-related boilerplate.
vocab = nltk.FreqDist(sentences)
freq_tuple = vocab.most_common(200)
freq_array = np.asarray(freq_tuple)  # kept for ad-hoc inspection only

# Build the frame straight from the (sentence, count) tuples and name the
# columns in the constructor, so an empty input yields an empty CSV instead
# of a column-length mismatch error.
df_most_common = pd.DataFrame(freq_tuple, columns=["most_freq", "freq"])
df_most_common.to_csv('2_wlg_most_common200.csv', encoding='utf-8')

# One sentence per row, no index and no header; this file is re-read by the
# next step and is also meant for manual classification.
df = pd.DataFrame(sentences, columns=["Ad_Desc"])
df.to_csv('2_wlg_sent_text.csv', encoding='utf-8', index=False, header=False)
###____________________________________________________####
# ---------------------------------------------------------------------------
# Step 2: strip agency/contact boilerplate from every exported sentence.
# ---------------------------------------------------------------------------

# Literal snippets that are removed from each line. They are matched as plain,
# case-sensitive substrings (no regular expressions). Duplicates are harmless.
# NOTE(review): the '[email protected]' entries look like redaction
# placeholders rather than real addresses - confirm against the raw data.
AGENT_BOILERPLATE_PATTERNS = [
    'Cell: 027 4411 015 Office: 471 2364 or email: [email protected]',
    'Deal with a professional office leasing broker.',
    'Contact Tom Burke for full details of this listing & more private listings.',
    'Any prices quoted are accurate at the time of listing and can be confirmed on receipt of your enquiry.',
    'Features:',
    'Cell: 027 4411 015 Office: 471 2364 or email: [email protected] Note: we have access to all listings incl sole listings on the market.',
    'Quick Easy Search & Market Report available at www.capitalrealty.co.nz"',
    'Contact Tom Burke over 20 years’ experience & over 450 leasing deals for full details of this listing & more private listings in the area.',
    'Contact Tom Burke (over 20 years’ experience & over 450 leasing deals) for full details of this listing & more private listings in the area.',
    'Contact Tom Burke (over 20 years’ experience & over 450 leasing deals) for full details of this listing ',
    '"Contact the leasing agent, Tom Burke (over 20 years’ experience & over 450 leasing deals) for full details of this listing and more private listings and to view this tenancy."',
    'Quick Easy Search & Blog/News & Market Report available at www.capitalrealty.co.nz"',
    '20 years experience and over 450 deals.',
    '(18 year’s experience and over 450 deals.)',
    'For market news and more, go to www.capitalrealty.co.nz Our guarantee: We will never knowingly advertise a listing that is no longer available."',
    # The comma after the next entry was missing, which silently glued it to
    # the following literal and produced one pattern that is neither snippet.
    '"For market news and more, go to www.capitalrealty.co.nz ""',
    'Contact Tom Burke over 20 years’ experience & over 450 leasing deals for full details of this listing & more private listings.',
    '& more private listings.',
    'Quick Easy Search & Blog/News & Market Report all available at www.capitalrealty.co.nz"',
    'Show more"',
    'Contact the leasing agent, Tom Burke over 20 years’ experience & over 450 leasing deals for full details of this listing and more private listings and to view this tenancy.',
    'Contact Tom Burke (over 20 years’ experience & over 450 deals) for full details of this listing ',
    'Cell: 027 4411 015 Office: 471 2364 or email [email protected]',
    'For market news and more, go to www.capitalrealty.co.nz"',
    'FEATURES:',
    'If any problem is found, it is attended to promptly.',
    "*** Not what you're hunting for?",
    'For market news and more, go to www.capitalrealty.co.nz',
    'Our guarantee: We will never knowingly advertise a listing that is no longer available."',
    '18 years experience and over 450 deals.',
    'Contact Tom Burke over 25 years’ experience & over 500 leasing deals for full details of this listing & more private listings in the area.',
    '"BUILDING DETAILS:',
    "Can't find the right space for you?",
    'If you\'d like to see the full range of Wellington CBD or fringe spaces that meet your office relocation requirements submit a brief at www.wellingtonofficespace.co.nz"',
    'For further details or to view, call Calder today!"',
    'We currently have other options available in the Wellington CBD and city fringe.',
    'Contact us for help finding the right environment for you and your business."',
    'Mark Melville 022 154 6558 - mailto:[email protected]',
    'Carl Hastings 021 403 502 - mailto:[email protected]',
    'For more information or to view, please contact the agent."',
    'Jeremy Langford on 021 278 0700 or email [email protected]',
    'Not what you are after?',
    'Mobile: 027 296 5989',
    'For further information and to arrange an inspection call Terry on:',
    'Key features of this property include:',
    'For further information contact:',
    'For more information contact:',
    'For more information contact the master agents:',
    'For more information please contact:',
    'Call Luke Kershaw today to discuss - 021 610 093',
    'For more information or to arrange an inspection contact:',
    'SPACE DETAILS:',
    'Contact details:',
    'Mark Melville 022 154 6558 - mailto:[email protected] or',
    'Mark Melville (022 154 6558) - mailto:[email protected] or',
    'For further detail and to arrange an inspection call Terry on:',
    'For more information and to arrange an inspection call Terry on:',
    'For further information call Terry on:',
    'Cell: 027 4411 015 or email: [email protected]',
    '"Email: [email protected]"""',
    'Paul Soulis',
    'Phone 027 4411 015 or email: [email protected] Note: we have access to all listings – over 750 incl sole listings on the market.',
    'Call Matthew today and let us help you find the perfect space for you!"',
    'Key Features:',
    'Price by negotiation.',
    'Contact Tom Burke over 20 years’ experience & over 450 deals for full details of this listing & more private listings.',
    'If you\'d like to see the full range of Wellington CBD or fringe spaces that meet your office relocation requirements give Luke a call today."',
    '& more private listings.',
    'Carl Hastings 021 403 502 - carl@the agencygroup.co.nz"',
    'Carl Hastings (021 403 502) - carl@the agencygroup.co.nz',
    'Quick Easy Search & up to date Office Market Report available at www.capitalrealty.co.nz"',
    'Carl Hastings 021 403 502 - carl@the agencygroup.co.nz',
    'For more information on this opportunity or any others please contact Jeremy Langford – Wellington Office Leasing specialist – 021-2780-700.',
    'Steve Maitland on 021 726 200 or email [email protected]',
    "We'll be dealing direct with the owner who is very keen to do a deal!",
    'Quick Easy Search & Market News available at www.capitalrealty.co.nz"',
    'Property Details:',
    'For more information about this space or to arrange a viewing, please contact:',
    'Carl Hastings 021 403 502 - [email protected]"',
    'Email: [email protected]"',
    '20 years’ experience and over 450 deals.',
    'Office: 04 474 1585',
    'OFFICE SPACE DETAILS:',
    'Phone 027 4411 015 or email: [email protected]',
    'Contact Tom Burke over 20 years’ experience & over 450 leasing deals for full details of this listing',
    'Contact Tom Burke (over 20 years’ experience & over 450 leasing deals) for full details of this listing ',
    '"Call Tom Burke on 027 4411015 to view"""',
    '"Ring Tom Burke on 471 2364 now for full details."""',
    '"Ring Tom Burke on 027 4411 015 to view."""',
    'Over 20 years’ experience and over 450 deals.',
    'KEY FEATURES:',
    'For more information about this space or other suitable space and to arrange a viewing, please contact:',
    'Call Tom Burke now on 471 2364 to discuss this and other options in the building.',
    'Contact Tom Burke for full details of this listing ',
    'Call Tom Burke 027 4411 015 for full details',
    ' Note: we have access to all listings (incl sole listings) on the market.',
    '"For market news and more, go to www.capitalrealty.co.nz ""',
    '"Contact the leasing agent, Tom Burke (over 20 years’ experience & over 450 leasing deals) for full details of this listing and more private listings and to view this tenancy."',
    'Email: [email protected]"',
    'Mobile: 027 296 5989',
    'Email: [email protected]"',
    'See photos for floor plan',
    'Over 20 years experience and over 450 deals.',
    'Looking for cheap?',
    'Key attributes:',
    'For more information please contact:',
    'For more information or to arrange an inspection contact Alastair Gustafson on 027 223 6013 or email mailto:[email protected] A member of the specialist CBD team.',
    'Please contact Andrew Fullerton-Smith on 021 896 060 for more information.',
    'Call Matt Clarke on 027 4409 608 to arrange inspection',
    'Evan Price on 027 448 4199 or email mailto:[email protected]',
    'Steve Maitland on 021 726 200 or email mailto:[email protected]',
    'Jeremy Langford on 021 278 0700 or email mailto:[email protected]',
    'More information: https://www.colliers.co.nz',
    'Call Matthew today and let us help you find the perfect space for you!',
    'For more information or to arrange an inspection contact Alastair Gustafson on 027 223 6013 or email mailto:[email protected]',
    'For more information or to arrange an inspection contact Alastair Gustafson on 027 223 6013 or',
    'COME VIEW TODAY For more information or to arrange an inspection contact Alastair Gustafson on 027 223 6013 or A member of the specialist CBD team.',
    'email mailto:[email protected]',
    'cbre.co.nz',
    # NOTE(review): the trailing "" below is an empty literal concatenated
    # onto the string, so the pattern ends in a space, not in quote marks.
    # If literal quote characters were intended, switch to single-quote
    # delimiters as in the other entries.
    "If you'd like to see the full range of Wellington CBD or fringe spaces that meet your office relocation requirements submit a brief at http://www.wellingtonofficespace.co.nz """,
]


def _strip_boilerplate(line: str, patterns) -> str:
    """Return *line* with every occurrence of each pattern removed.

    Replacements are cumulative: each pattern is applied to the result of the
    previous one, in the order given by *patterns*.
    """
    for pattern in patterns:
        line = line.replace(pattern, "")
    return line


# Longest patterns are applied first so that a short snippet (e.g. 'Features:')
# cannot break up a longer one that contains it (e.g. 'Key Features:').
# dict.fromkeys drops duplicates while keeping a deterministic order for ties.
_patterns_longest_first = sorted(
    dict.fromkeys(AGENT_BOILERPLATE_PATTERNS), key=len, reverse=True
)

filename_sent_text = "2_wlg_sent_text.csv"
with open(filename_sent_text, "r", encoding="utf-8") as file:
    mystring = file.readlines()

# Previously every replacement started again from the untouched line, so when
# several patterns matched the same line only the last match was removed.
mystring = [_strip_boilerplate(line, _patterns_longest_first) for line in mystring]
# ---------------------------------------------------------------------------
# Step 3: normalise the text (punctuation, non-alphabetic tokens, stop words,
# lemmatisation).
# ---------------------------------------------------------------------------

# Re-assemble the processed lines into one string.
remove_agent_related_info = "".join(mystring)
#print("remove_agent_related_info: ",remove_agent_related_info) #agency_related contents removed as much as possible

# Replace every character that is neither a word character (\w: letters,
# digits, underscore) nor whitespace with a single space.
cleantext = re.sub(r'[^\w\s]', ' ', remove_agent_related_info)
#print("cleantext:",cleantext)

# Tokenise once and reuse the result below instead of tokenising the whole
# corpus a second time.
words = word_tokenize(cleantext)
#print("words:",words)

# Keep purely alphabetic tokens only; this drops numbers and also any token
# that mixes letters with digits or underscores.
cleantext_no_digits = ' '.join([w for w in words if w.isalpha()])
#print("cleantext_no_digits:",cleantext_no_digits)

# NLTK's English stop words plus project-specific additions: contractions
# written without apostrophes, unit/measurement abbreviations and generic
# listing vocabulary.
stop_words=stopwords.words('english')
additional_stopwds = ['youre', 'youve','youll','youd','shes','its','thatll','dont','shouldve','arent','couldnt',
    'didnt','doesnt','hadnt','hasnt','havent','isnt','mightnt','mustnt','neednt','shant','shouldnt',
    'wasnt','werent','wont','wouldnt','us','dont','would','sqm','sq','sqmtr','sqr','mtr','weve','theres',
    'cant','th','sm','psm','whats','mtrs','thats','level','levels','meter','metre','sqmt','gst','pa','andor',
    'however','today','call','available','ready','per','email','one','two','three','also','contact','approx',
    'approximately','nla','gla','day','floor','floors','via','show','please','well','address','could',
    'listing','id','still','touch','find','finding','found','able','st','yet','always','almost','although',
    'among','anytime','call','rd','every','many','along','already','annum', 'become','come',
    'either','etc','ever','everything','forward','whilst','whether','hesitate', 'regarding',
    'looking', 'offering','instagram','offered','facebook','become','linkedin','sqmof',
    'inspect','information','info','give','include','includes','including','advice','need','provide','provides',
    'provided','providing','may','needs','must','love','unbiased','size','help','negotiation','metrocommercial','www','http']
stop_words.extend(additional_stopwds)

# Membership is tested once per token, so use a set for constant-time lookups
# (the list itself is left intact under its original name).
stop_word_lookup = frozenset(stop_words)

# The joined string is tokenised again rather than reusing the filtered list.
# NOTE(review): word_tokenize may split some tokens further than a plain
# whitespace split would, so this call is kept to preserve the output.
cleantext_no_stopwds = [
    w for w in word_tokenize(cleantext_no_digits)
    if w.lower() not in stop_word_lookup
]
#print("cleantext_no_stopwds:",cleantext_no_stopwds)

# Convert each word to its base form, e.g. "walks" -> "walk".
lemmatizer = WordNetLemmatizer()
cleantext_lemmatized = ' '.join([lemmatizer.lemmatize(w) for w in cleantext_no_stopwds])
#print("cleantext_lemmatized:",cleantext_lemmatized)
## Abandoned attempt to plot the most frequent words (not working).
## NOTE(review): `Corpus` is the `nltk.corpus` module imported at the top of
## the file, so calling it like a class cannot work; presumably `nltk.Text`
## was intended - confirm before reviving this.
##myCorpus = Corpus([lemmatizer.lemmatize(w) for w in cleantext_no_stopwds])
##Most_freq_100 = nltk.FreqDist(myCorpus)
##print("Most_freq_100:",Most_freq_100.most_common(100))
##myCorpus.dispersion_plot(Most_freq_100)
##-----------------------------------------------Bag of Words https://www.youtube.com/watch?v=8JcLENGoXL0&list=PLP_4EPVEox99f1-_JMRpQbPkcGlMd5Xoq&index=3----------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer
pd.set_option("display.max_rows", None, "display.max_columns", None) ##set this to display all results

# Count 2-word and 3-word sequences (ngram_range=(2,3)) over the whole
# cleaned corpus, which is passed in as a single document.
count_vec = CountVectorizer(analyzer='word', ngram_range=(2,3))
count_occurs = count_vec.fit_transform([cleantext_lemmatized])

# Use get_feature_names_out() throughout: the block previously also called
# get_feature_names(), which newer scikit-learn releases no longer provide.
feature_names = count_vec.get_feature_names_out().tolist() ##as a list
feature_names_txt ='\n'.join([str(item) for item in feature_names]) ##in str

# One row per n-gram; row 0 of the dense matrix holds the counts for the
# single fitted document, in the same order as feature_names.
count_occur_df = pd.DataFrame({
    'Word2': feature_names,
    'Word_Freq': count_occurs.toarray().tolist()[0],
})
count_occur_df.sort_values('Word_Freq', ascending=False,inplace=True)
#print("count_occur_df: ",count_occur_df)
count_occur_df.to_csv('2_wlg_BagofWords.csv', encoding='utf-8', index=False)
##----------------------------------------------TF-iDF----------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
pd.set_option("display.max_rows", None, "display.max_columns", None) ##set this to display all results

# Keep the class name intact: the instance gets its own name instead of
# rebinding `TfidfVectorizer`, which made the class unusable afterwards.
tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(2,3))

# NOTE(review): only one document is fitted, so presumably the IDF factor is
# the same for every n-gram and the scores reduce to normalised counts -
# confirm this is intended (fitting one document per ad would differ).
vectors = tfidf_vec.fit_transform([cleantext_lemmatized])
#print("vectors", vectors)
wd_freq = vectors.toarray()
#print("wd_freq: ", wd_freq)

# get_feature_names() is no longer provided by newer scikit-learn releases;
# get_feature_names_out() is the replacement.
feature_names_vectors = tfidf_vec.get_feature_names_out().tolist() ##as a list
#print("feature_names_vectors: ",feature_names_vectors)

# One row per n-gram; row 0 of the dense matrix holds the scores for the
# single fitted document, in the same order as feature_names_vectors.
vectors_df = pd.DataFrame({
    'Words': feature_names_vectors,
    'TF_iDF': wd_freq.tolist()[0],
})
vectors_df.sort_values('TF_iDF', ascending=False,inplace=True)
#print(vectors_df.head())
#print("vectors_df:", vectors_df)
vectors_df.to_csv('2_wlg_TF_iDF_keywords.csv', encoding='utf-8', index=False)
##----------------------------------------------Text Classification [Naive Bayes Classifier]----------------------------------------------
# NOTE: no classifier is implemented under this header in the visible source;
# the imports below are currently unused and kept for the planned work.
from sklearn import model_selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# `file` was opened inside a `with` block, so it is already closed here; the
# redundant explicit file.close() call has been dropped.
print("2_wlg_vectorised run complete 100% +++++++++++++++++++++++++++++++++++++++")