# corpus.py
import re
def tokenize(text):
    """
    Tokenize text using regular expressions, handling punctuation,
    capitalization, contractions, etc.

    Parameters
    ----------
    text : string
        A single line of text from the training data.

    Returns
    -------
    list
        The tokenized text as a list of strings.
    """
    text = text.replace('--', ' ')
    text = clean_regex(text)
    # remove parentheses
    text = re.sub(r"[()]", "", text)
    # collapse runs of '.' or '-' into a single character
    text = re.sub(r'([.-])+', r'\1', text)
    # split a trailing punctuation char off the last word
    if text and text[-1] in ".,!?":
        text = text[:-1] + " " + text[-1]
    # pad remaining punctuation chars with whitespace, except inside numbers
    text = re.sub(r'([^0-9])([.,!?])([^0-9])', r'\1 \2 \3', text)
    tokens = text.split()
    tokens = [word.lower() for word in tokens]
    return tokens
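
# Usage sketch (expected output assuming the rules above; contraction
# expansion and punctuation padding yield separate tokens):
#   tokenize("I'm here, (really).")
#   -> ['i', 'am', 'here', ',', 'really', '.']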
def clean_regex(text):
    """
    Expand common English contractions in the text.

    Parameters
    ----------
    text : string
        A line of text.

    Returns
    -------
    string
        The text with contractions expanded.
    """
    # whole-word contractions; re.IGNORECASE catches capitalized forms
    # such as "I'm", since tokenize only lowercases after this step
    text = re.sub(r"\bi'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub(r"\bhe's", "he is", text, flags=re.IGNORECASE)
    text = re.sub(r"\bshe's", "she is", text, flags=re.IGNORECASE)
    text = re.sub(r"\bthat's", "that is", text, flags=re.IGNORECASE)
    text = re.sub(r"\bwhat's", "what is", text, flags=re.IGNORECASE)
    text = re.sub(r"\bwhere's", "where is", text, flags=re.IGNORECASE)
    # contraction suffixes
    text = re.sub(r"'ll\b", " will", text, flags=re.IGNORECASE)
    text = re.sub(r"'ve\b", " have", text, flags=re.IGNORECASE)
    text = re.sub(r"'re\b", " are", text, flags=re.IGNORECASE)
    text = re.sub(r"'d\b", " would", text, flags=re.IGNORECASE)
    text = re.sub(r"\bwon't\b", "will not", text, flags=re.IGNORECASE)
    text = re.sub(r"\bcan't\b", "can not", text, flags=re.IGNORECASE)
    return text
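
# Usage sketch (expected output assuming the substitutions above):
#   clean_regex("She's sure they'll win")
#   -> "she is sure they will win"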
def detokenize(tokens):
    """
    Join a list of tokens back into text, fixing whitespace around
    punctuation and restoring capitalization.

    Parameters
    ----------
    tokens : list
        The list of tokens to join.

    Returns
    -------
    string
        The reassembled text.
    """
    # drop any None entries before joining
    tokens = [token for token in tokens if token is not None]
    text = ' '.join(tokens)
    # remove whitespace before punctuation and normalize the space after it
    text = re.sub(r'\s+([.,!?])\s*', r'\1 ', text)
    # capitalize the first letter
    text = text.capitalize()
    # capitalize letters following terminated sentences
    text = re.sub(r'([.!?]\s+[a-z])', lambda c: c.group(1).upper(), text)
    return text
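
# Usage sketch (expected output assuming the steps above):
#   detokenize(["to", "john", ".", "hey", "come", "here"])
#   -> 'To john. Hey come here'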

if __name__ == "__main__":
    # quick smoke test of both directions
    print(tokenize("[will], not you."))
    print(detokenize(["to", "john", ".", "hey", "come", "here"]))