Commit

Upgrades
otaviodantas committed Oct 16, 2020
1 parent daaf600 commit 7f2e521
Showing 7 changed files with 45 additions and 38 deletions.
.gitignore (1 change: 1 addition & 0 deletions)

@@ -1 +1,2 @@
 ./teste.py
+.pyc*
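
A note on the added pattern (an editorial observation, not part of the commit): .pyc* matches names that begin with ".pyc", whereas compiled Python bytecode files are conventionally ignored with *.pyc or the __pycache__/ directory.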

aux_mod.py (2 changes: 1 addition & 1 deletion)

@@ -7,7 +7,7 @@
 nltk.download("stopwords")
 
 
-def TokenData(data) -> List[str]:
+def TokenData(data: str) -> List[str]:
     token_space = tokenize.WhitespaceTokenizer()
     all_words_token = token_space.tokenize(data)
     return all_words_token
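
For reference, a minimal usage sketch of the newly annotated function (hypothetical input; assumes nltk is installed and that aux_mod.py already imports tokenize from nltk, as the WhitespaceTokenizer call implies):

    from aux_mod import TokenData

    tokens = TokenData("bom dia Brasil")  # splits on whitespace via nltk's WhitespaceTokenizer
    print(tokens)  # ['bom', 'dia', 'Brasil']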

call_stream.py (2 changes: 1 addition & 1 deletion)

@@ -2,7 +2,7 @@
 from stream import MyStreamListener
 
 
-def calling_stream(wordkey, api):
+def calling_stream(wordkey: str, api: tweepy):
     twitter_listener = MyStreamListener(api, wordkey)
     stream = tweepy.Stream(api.auth, twitter_listener)
     stream.filter(track=[wordkey], languages=["pt"])
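
A side note on the new annotation: api: tweepy points at the module rather than a class. tweepy's client type is tweepy.API, so a more precise signature would look like the sketch below (an assumption based on the tweepy 3.x calls in this function, not part of the commit):

    import tweepy
    from stream import MyStreamListener


    def calling_stream(wordkey: str, api: tweepy.API) -> None:
        twitter_listener = MyStreamListener(api, wordkey)
        stream = tweepy.Stream(api.auth, twitter_listener)  # tweepy 3.x streaming interface
        stream.filter(track=[wordkey], languages=["pt"])  # only Portuguese-language tweets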

config.env (1 change: 1 addition & 0 deletions)

@@ -2,6 +2,7 @@
 # WORDKEY is a variables that will be searched
 
 WORDKEY=
+
 CONSUMER_KEY=
 CONSUMER_SECRET_KEY=
 ACESS_TOKEN=

main.py (9 changes: 7 additions & 2 deletions)

@@ -1,10 +1,15 @@
 import os
 from dotenv import load_dotenv
 
-from auth import create_api
+import sys
+
+sys.path.insert(1, "./all_module/call_stream")
+sys.path.insert(1, "./all_module/auth")
+
 from call_stream import calling_stream
+from auth import create_api
 
-load_dotenv('config.env')
+load_dotenv("wordkey.env")
 
 if __name__ == "__main__":
     wordkey = os.getenv("WORDKEY")
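
A minimal sketch of how the entry point now resolves the search term (assumes wordkey.env defines WORDKEY, mirroring the variable declared in config.env):

    import os
    from dotenv import load_dotenv

    load_dotenv("wordkey.env")  # copies WORDKEY from the env file into the process environment
    wordkey = os.getenv("WORDKEY")  # returns None if the variable is absent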

stream.py (1 change: 0 additions & 1 deletion)

@@ -16,7 +16,6 @@ def on_status(self, tweet):
             return
         else:
             messy_data = self.trash.clean(tweet.text)
-            print(messy_data)
             TransformToCSV(messy_data)
 
     def on_error(self, status):
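
For orientation, the listener pattern this file implements (a skeleton of the tweepy 3.x StreamListener interface as used here; the method bodies are placeholders, not the repository's code):

    import tweepy


    class MyStreamListener(tweepy.StreamListener):
        def on_status(self, tweet):
            # called once per matching tweet; clean the text and persist it
            ...

        def on_error(self, status):
            # returning False disconnects the stream, e.g. on HTTP 420 rate limiting
            return False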

twitter_data_cleaner.py (67 changes: 34 additions & 33 deletions)

@@ -7,19 +7,17 @@
 from aux_mod import TokenData
 
 
-class TwitterDataCleaner():
-
+class TwitterDataCleaner:
     def __init__(self, list_stopword: List[str]):
         self.stopword = list_stopword
 
-    def clean(self, data):
+    def clean(self, data: str):
         cleaned_data = self.__remove_stopwords(data)
         cleaned_data = self.__remove_user(cleaned_data)
         cleaned_data = self.__remove_URL(cleaned_data)
         cleaned_data = self.__remove_emoji(cleaned_data)
-        cleaned_data = self.__remove_hashtag(cleaned_data)
         cleaned_data = self.__remove_punct(cleaned_data)
         cleaned_data = self.__remove_hashtag(cleaned_data)
         return cleaned_data
 
     def __remove_stopwords(self, data: str) -> str:

@@ -31,50 +29,53 @@ def __remove_stopwords(self, data: str) -> str:
             if not_accent not in self.stopword:
                 list_clean_data.append(word)
 
-        phrase_complete = ' '.join(list_clean_data)
+        phrase_complete = " ".join(list_clean_data)
         return phrase_complete
 
     def __remove_user(self, data: str) -> str:
-        pattern = re.compile(r'@\w+: |@\w+|@\w+ ')
-        data_without_user = pattern.sub(r'', str(data))
+        pattern = re.compile(r"@\w+: |@\w+|@\w+ ")
+        data_without_user = pattern.sub(r"", str(data))
         return data_without_user
 
     def __remove_URL(self, data: str) -> str:
-        url = re.compile(r'https?://\S+|www\.\S+|https+|tco?')
-        data_without_url = url.sub(r'', data)
+        url = re.compile(r"https?://\S+|www\.\S+|https+|tco?")
+        data_without_url = url.sub(r"", data)
         return data_without_url
 
     def __remove_emoji(self, data: str) -> str:
-        emoji_pattern = re.compile("["
-                                   u"\U0001F600-\U0001F64F"  # emoticons
-                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
-                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
-                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
-                                   u"\U00002500-\U00002BEF"  # chinese char
-                                   u"\U00002702-\U000027B0"
-                                   u"\U00002702-\U000027B0"
-                                   u"\U000024C2-\U0001F251"
-                                   u"\U0001f926-\U0001f937"
-                                   u"\U00010000-\U0010ffff"
-                                   u"\u2640-\u2642"
-                                   u"\u2600-\u2B55"
-                                   u"\u200d"
-                                   u"\u23cf"
-                                   u"\u23e9"
-                                   u"\u231a"
-                                   u"\ufe0f"  # dingbats
-                                   u"\u3030"
-                                   "]+", flags=re.UNICODE)
+        emoji_pattern = re.compile(
+            "["
+            u"\U0001F600-\U0001F64F"  # emoticons
+            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+            u"\U0001F680-\U0001F6FF"  # transport & map symbols
+            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+            u"\U00002500-\U00002BEF"  # chinese char
+            u"\U00002702-\U000027B0"
+            u"\U00002702-\U000027B0"
+            u"\U000024C2-\U0001F251"
+            u"\U0001f926-\U0001f937"
+            u"\U00010000-\U0010ffff"
+            u"\u2640-\u2642"
+            u"\u2600-\u2B55"
+            u"\u200d"
+            u"\u23cf"
+            u"\u23e9"
+            u"\u231a"
+            u"\ufe0f"  # dingbats
+            u"\u3030"
+            "]+",
+            flags=re.UNICODE,
+        )
 
-        data_without_emoji = emoji_pattern.sub(r'', str(data))
+        data_without_emoji = emoji_pattern.sub(r"", str(data))
         return data_without_emoji
 
     def __remove_punct(self, data: str) -> str:
-        table = str.maketrans('', '', string.punctuation)
+        table = str.maketrans("", "", string.punctuation)
         data_without_punct = data.translate(table)
         return data_without_punct
 
     def __remove_hashtag(self, data: str) -> str:
-        pattern = re.compile(r'#\w+')
-        data_without_hashtag = pattern.sub(r'', str(data))
+        pattern = re.compile(r"#\w+")
+        data_without_hashtag = pattern.sub(r"", str(data))
         return data_without_hashtag
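
One behavioral consequence of this hunk worth noting: the duplicate __remove_hashtag call that ran before __remove_punct was the one that actually deleted hashtags, since string.punctuation includes the # character. After this commit the surviving __remove_hashtag call runs on punctuation-free text, so the pattern r"#\w+" no longer matches and hashtag words remain as plain tokens. A hypothetical usage of the cleaner (assumes NLTK's Portuguese stopword list, consistent with the nltk.download("stopwords") call in aux_mod.py):

    from nltk.corpus import stopwords

    from twitter_data_cleaner import TwitterDataCleaner

    cleaner = TwitterDataCleaner(stopwords.words("portuguese"))
    print(cleaner.clean("RT @usuario: veja isso https://t.co/abc #noticia"))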
