mypreprocessing.py
import re
import string

import contractions
import nltk
import tweepy
from nltk.corpus import stopwords
from num2words import num2words

# Connect to the Twitter API using tweepy.
# NOTE: credentials are hardcoded here; in practice they should be loaded
# from environment variables or a config file kept out of version control.
consumer_key = "CtmrTSUsnIRHKSgj4ktqK4NKb"
consumer_secret = "1MNfKmptTsrJcq7WMYxOoVwEgFCK5DNMQeC43IZDkPucPObCnZ"
access_key = "1231240089600057344-rqYnkeVvMaB1XwfvrERfI7aF38r69L"
access_secret = "KmRfAoexHxfkHKJaC3hwB0sgtfocyz58IqdzYiWeaXSGO"

# Authenticate with the consumer key/secret and the user's access tokens
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)

# Build the API client
api = tweepy.API(auth)
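
# Optional sanity check (a sketch, assuming the credentials above are valid):
# calling api.verify_credentials() confirms that authentication succeeded.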


def get_username(handle):
    """Return the display name for a Twitter handle, or None if the lookup fails."""
    try:
        # Fetch the user's recent tweets and read the display name
        # from the first one
        tweets = api.user_timeline(screen_name=handle)
        for tweet in tweets:
            return tweet._json['user']['name']
    except tweepy.error.TweepError:
        # tweepy 3.x exception; tweepy 4.x renamed it tweepy.errors.TweepyException
        return None
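
# Example (hypothetical handle, assuming valid credentials):
# get_username("@some_account") returns the account's display name,
# e.g. "Some Account", or None if the handle is unknown or the API call fails.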


english_stopwords = stopwords.words('english')


def preprocess(temp):
    # Remove URLs, then expand contractions (e.g. "it's" -> "it is")
    temp = re.sub(r"(http|https)\S+", "", temp)
    temp = contractions.fix(temp)
    # Tokenize with NLTK's default word tokenizer
    tokens = nltk.word_tokenize(temp)
    # Treat the tokenizer's quote and dash artifacts as punctuation too;
    # a local copy avoids mutating string.punctuation on every call
    punctuation = string.punctuation + "''``--"
    new_tokens = []
    number_pattern = r'\d+(\.\d+)?'
    skip_next = False
    for i, token in enumerate(tokens):
        if skip_next:
            # This token is a handle already consumed by the '@' branch below
            skip_next = False
            continue
        if token == '@' and i + 1 < len(tokens):
            # Replace a Twitter handle with the account's display name
            handle = token + tokens[i + 1]
            name = get_username(handle)
            if name is not None:
                for t in nltk.word_tokenize(name):
                    new_tokens.append(t.lower())
                skip_next = True
                continue
        if len(token) < 3:
            continue
        if token in english_stopwords:
            continue
        if token not in punctuation:
            if not re.match(number_pattern, token):
                # Strip stray punctuation characters inside the token
                for s in token:
                    if s in punctuation:
                        token = token.replace(s, '')
            elif token.isdigit():
                # Spell out numbers, e.g. "100" -> "one hundred"
                token = num2words(float(token))
            if token:
                # Lowercase and drop any non-ASCII characters
                new_tokens.append(token.lower().encode("ascii", errors="ignore").decode())
    # Optional lemmatization / stemming passes (currently disabled):
    '''
    # Lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    final_tokens = []
    for token in new_tokens:
        final_tokens.append(lemmatizer.lemmatize(token))

    # Stemming
    stemmer = nltk.PorterStemmer()
    final_tokens = []
    for token in new_tokens:
        final_tokens.append(stemmer.stem(token))
    '''
    return new_tokens
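

# Minimal usage sketch (the sample tweet below is made up; running this needs
# valid Twitter credentials plus the NLTK 'punkt' and 'stopwords' data, e.g.
# via nltk.download('punkt') and nltk.download('stopwords')):
if __name__ == "__main__":
    sample = "Loving the weather today! It's 100 degrees outside https://example.com"
    print(preprocess(sample))
    # Roughly: URL removed, contraction expanded, stopwords and short tokens
    # dropped, the number spelled out, e.g.
    # ['loving', 'weather', 'today', 'one hundred', 'degrees', 'outside']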