-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataclean.py
50 lines (38 loc) · 1.49 KB
/
dataclean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
import numpy as np
import re
import sys
# Markup tags, URLs, standalone "RT" markers, @mentions and runs of digits.
_NOISE_RE = re.compile(
    r'(</?[a-zA-Z]+>|https?://[^\s]*|(^|\s)RT(\s|$)|@[^\s]+|\d+)'
)
# Anything that is neither whitespace nor an ASCII letter.
_NON_LETTER_RE = re.compile(r'[^\sa-zA-Z]+')
_WHITESPACE_RE = re.compile(r'\s+')


def remove_tags(row):
    """Return *row* reduced to space-separated ASCII letters.

    The following are replaced by a space: markup tags such as ``<b>`` or
    ``</b>``, ``http``/``https`` URLs, standalone ``RT`` tokens, ``@mentions``
    and runs of digits.  Every remaining character that is not whitespace or
    an ASCII letter is then deleted, whitespace runs are collapsed to a
    single space and the result is stripped.

    Args:
        row: The value to clean.  ``bytes``/``bytearray`` input is decoded as
            ASCII (undecodable bytes are dropped); anything else is converted
            with ``str()``.

    Returns:
        The cleaned text as a ``str`` (possibly empty).
    """
    if isinstance(row, (bytes, bytearray)):
        # Decode instead of calling str(): str(b'...') yields the repr text
        # "b'...'", which leaks a leading "b" and escape letters such as the
        # "n" of "\n" into the cleaned output.
        text = bytes(row).decode('ascii', 'ignore')
    else:
        text = str(row)
    text = _NOISE_RE.sub(' ', text)
    text = _NON_LETTER_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def clean_text(row):
    """Drop non-ASCII characters from *row* and strip tweet noise from it.

    Args:
        row: The cell value to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned text produced by ``remove_tags``.
    """
    # Round-trip through ASCII to discard non-ASCII characters while keeping
    # a real str, so remove_tags never sees a bytes repr.
    ascii_text = str(row).encode('ascii', 'ignore').decode('ascii')
    return remove_tags(ascii_text)
# train_data_path = r"data/training-Obama-Romney-tweets.xlsx"
#
# obama_df = pd.read_excel(train_data_path, sheetname=0)
# romney_df = pd.read_excel(train_data_path, sheetname=1)
#
# obama_df['tweet'] = obama_df['tweet'].apply(clean_text)
# romney_df['tweet'] = romney_df['tweet'].apply(clean_text)
#
# print("Writing cleaned training output")
# obama_df.to_csv('data/obama_csv.csv', sep='\t')
# romney_df.to_csv('data/romney_csv.csv', sep='\t')
#
# print("Writing cleaned testing output")
# Clean the 'tweet' column of the first two sheets of the testing workbook
# and write each sheet out as a tab-separated file.
test_data_path = r"data/testing-Obama-Romney-tweets.xlsx"
# ``sheet_name`` is the current pandas keyword; the older ``sheetname``
# spelling is no longer accepted by read_excel.
obama_df = pd.read_excel(test_data_path, sheet_name=0)
romney_df = pd.read_excel(test_data_path, sheet_name=1)
obama_df['tweet'] = obama_df['tweet'].apply(clean_text)
romney_df['tweet'] = romney_df['tweet'].apply(clean_text)
# Announce the write before it happens rather than after it has finished.
print("Writing cleaned testing output")
obama_df.to_csv('data/obama_csv_test.csv', sep='\t')
romney_df.to_csv('data/romney_csv_test.csv', sep='\t')