forked from lbechberger/MLinPractice
-
Notifications
You must be signed in to change notification settings - Fork 0
/
punctuation_remover.py
29 lines (24 loc) · 1007 Bytes
/
punctuation_remover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Preprocessor that removes punctuation from the original tweet text.
"""
import re
import string

from src.preprocessing.preprocessors.preprocessor import Preprocessor
from src.util import COLUMN_TWEET, COLUMN_PUNCTUATION
# removes punctuation from the original tweet
# inspired by https://stackoverflow.com/a/45600350
class PunctuationRemover(Preprocessor):
    """Preprocessor that removes ASCII punctuation from the tweet text.

    Passes ``[COLUMN_TWEET]`` as input columns and ``COLUMN_PUNCTUATION`` as
    the output column to the ``Preprocessor`` base-class constructor.
    """

    def __init__(self):
        """Declare the tweet column as input and the punctuation column as output."""
        # input column "tweet", new output column
        super().__init__([COLUMN_TWEET], COLUMN_PUNCTUATION)

    def _set_variables(self, inputs):
        """Build the regular expression that matches one punctuation character.

        ``inputs`` is not used: the pattern depends only on
        ``string.punctuation``.
        """
        # The characters must be escaped before being wrapped in "[...]".
        # Unescaped, the "\]" contained in string.punctuation is parsed as an
        # escaped "]", so the backslash itself would be missing from the
        # character class and would never be removed.
        self._punctuation = "[{}]".format(re.escape(string.punctuation))

    def _get_values(self, inputs):
        """Return the first input column with all punctuation removed.

        ``inputs[0]`` is accessed through ``.str`` and is therefore assumed
        to be a pandas Series of strings (TODO confirm against the caller).
        """
        # regex=True has to be explicit: since pandas 2.0 the default of
        # Series.str.replace is literal matching, which would leave the
        # text unchanged because the pattern never occurs verbatim.
        column = inputs[0].str.replace(self._punctuation, "", regex=True)
        return column