# forked from lshang0311/pandas-examples
# text_preprocessing.py (41 lines, 1.21 KB)
import string

import pandas as pd
import nltk

"""Text-preprocessing walkthrough: lower-casing, punctuation stripping,
tokenization, and stopword removal on a pandas DataFrame of sentences.

References:
https://stackoverflow.com/questions/33098040/how-to-use-word-tokenize-in-data-frame
https://stackoverflow.com/questions/33245567/stopword-removal-with-nltk-and-pandas/33246035
https://medium.com/@chaimgluck1/have-messy-text-data-clean-it-with-simple-lambda-functions-645918fcc2fc
"""

# One-time fetch of the tokenizer model and the stopword corpus.
nltk.download('punkt')
nltk.download('stopwords')

# input
df = pd.DataFrame({
    'sentences':
        [
            'This is a very good site. I will recommend it to others.',
            'Can you please give me a call at 9983938428. have issues with the listings.',
            'good work! keep it up'
        ]
})

# lower case
df['lower_case'] = df['sentences'].apply(str.lower)

# Strip all punctuation in a single C-level pass: map every punctuation
# character to None via str.translate.
table = str.maketrans({}.fromkeys(string.punctuation))
df['punctuation_removed'] = df['sentences'].apply(lambda x: x.translate(table))

# Tokenize — two equivalent spellings (row-wise apply vs. Series.apply).
df['tokenized_1'] = df.apply(lambda row: nltk.word_tokenize(row['sentences']), axis=1)
df['tokenized_2'] = df['sentences'].apply(nltk.word_tokenize)

# Stopword removal.
# BUG FIX: NLTK's stopword list is all lower-case, but tokenized_1 was
# built from the original (un-lowercased) sentences, so capitalized
# stopwords such as "This", "I", "Can" were never filtered. Compare
# case-insensitively instead; output tokens keep their original casing.
# A set makes each membership test O(1) instead of O(n).
stop = set(nltk.corpus.stopwords.words('english'))
df['stopwords_removed'] = df['tokenized_1'].apply(
    lambda tokens: [tok for tok in tokens if tok.lower() not in stop]
)

print(df)