forked from steveyx/trending-stories
-
Notifications
You must be signed in to change notification settings - Fork 0
/
news_cleaner.py
82 lines (71 loc) · 3.18 KB
/
news_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re
import unicodedata
class NewsCleaner:
non_en_chars = {
"’": "'",
"‘": "'"
}
remove_from_title = ["BREAKING:", "[\-:] report", "[\-:] sources", "[\-:] source", "source says", "Exclusive\:",
"Factbox\:", "Timeline[\-:]", "Instant View\:", "Explainer\:", ": Bloomberg",
": WSJ"]
remove_if_start_with = ['close breaking news']
replace_if_contain = ['click here to see']
@staticmethod
def clean_news_article(title="", content=""):
_c_title = NewsCleaner.clean_title(title)
_c_content = NewsCleaner.clean_content(content)
return _c_title, _c_content
@staticmethod
def clean_title(raw_title):
txt = NewsCleaner.remove_non_en_chars(raw_title)
txt = NewsCleaner.replace_abbreviation(txt)
p = "|".join(NewsCleaner.remove_from_title)
txt = re.sub(p, '', txt)
txt = re.sub(r'\s+', " ", txt).strip()
return txt
@staticmethod
def clean_content(raw_content):
if not raw_content:
return ""
remove_if_start_with = NewsCleaner.remove_if_start_with
replace_if_contain = NewsCleaner.replace_if_contain
txt = NewsCleaner.remove_non_en_chars(raw_content)
txt = NewsCleaner.replace_abbreviation(txt)
lines = txt.split("\n")
# remove short sentence
clean1 = [l for l in lines if len(l) > 30]
clean2 = [l for l in clean1 if len(l.split(" ")) > 3]
# remove the words if the sentence starts with certain pattern
p_start = r"^ *" + r"|^ *".join(remove_if_start_with)
clean3 = [l for l in clean2 if re.match(p_start, l) is None]
# remove the words if the sentence contains certain pattern
p_contain = r"|".join(replace_if_contain)
clean4 = [re.sub(p_contain, '', l) for l in clean3]
clean5 = " ".join(clean4)
return re.sub(r'\s+', ' ', clean5).strip()
@staticmethod
def remove_non_en_chars(txt):
# remove non english characters
txt = NewsCleaner.convert_latin_chars(txt)
for char in NewsCleaner.non_en_chars.keys():
txt = re.sub(char, NewsCleaner.non_en_chars[char], txt)
txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)
return txt
@staticmethod
def convert_latin_chars(txt):
# convert latin characters
return ''.join(char for char in unicodedata.normalize('NFKD', txt) if unicodedata.category(char) != 'Mn')
@staticmethod
def replace_abbreviation(txt):
words = {'U.S.': 'US'}
for w in words.keys():
txt = re.sub(w, words[w], txt)
return txt
if __name__ == "__main__":
title = "Facebook's Zuckerberg to testify before Congress: source"
content = "Facebook Inc Chief Executive Mark Zuckerberg plans to testify before U.S. Congress, a source briefed " \
"on the matter said on Tuesday, as he bows to pressure from lawmakers insisting he explain how 50 " \
"million users' data ended up in the hands of a political consultancy."
cleaned_title, cleaned_content = NewsCleaner.clean_news_article(title=title, content=content)
print(cleaned_title)
print(cleaned_content)