forked from jeochris/App-Review-Sentiment-Summarization
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
93 lines (74 loc) · 3.19 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from konlpy.tag import Kkma, Okt
import pandas as pd
class Preprocess(object):
    """Split raw app reviews into cleaned sentences.

    ``review_dict_total`` is expected to be a mapping whose values are
    dict-like objects with a ``'review'`` key holding the raw review text
    (that is the only access pattern used here). After :meth:`preprocess`
    runs, ``result_sentences`` holds the kept sentences and
    ``result_sentences_df`` holds the same sentences in a one-column
    (``'Review'``) pandas DataFrame.
    """

    # Okt POS tags dropped by ``CustomTokenizer`` unless the caller overrides
    # them. A tuple (not a list) so the default can never be mutated by
    # accident and shared between calls.
    _DEFAULT_STOP_TAGS = ('Josa', 'Suffix', 'Punctuation', 'Foreign', 'Number')
    # Okt POS tags removed from the cleaned sentence text.
    _REMOVED_TAGS = ('Punctuation', 'Foreign')
    # Okt POS tags that are glued to the preceding word without a space.
    _ATTACHED_TAGS = ('Suffix', 'Josa')
    # A sentence with at most this many content tokens is merged into the
    # nearest preceding non-empty sentence.
    _SHORT_SENTENCE_MAX_TOKENS = 10
    # Cleaned sentences with at most this many characters are discarded.
    _SHORT_SENTENCE_MAX_CHARS = 15

    def __init__(self, review_dict_total):
        """Create the KoNLPy taggers and store the reviews to process."""
        self.okt = Okt()
        self.kkma = Kkma()
        self.review_dict_total = review_dict_total
        self.result_sentences = []
        self.result_sentences_df = None

    def CustomTokenizer(self, corpus, stop=_DEFAULT_STOP_TAGS):
        """Return the Okt tokens of *corpus* whose POS tag is not in *stop*.

        Tokens are produced by ``self.okt.pos(corpus, stem=True, norm=True)``.

        Args:
            corpus: Text to tokenize.
            stop: Container of POS tags to drop; any object supporting ``in``.

        Returns:
            List of the remaining token strings, in order.
        """
        return [
            token
            for token, tag in self.okt.pos(corpus, stem=True, norm=True)
            if tag not in stop
        ]

    def _clean_sentence(self, sentence):
        """Rebuild *sentence* from its Okt tokens without punctuation/foreign tokens.

        Suffix/Josa tokens are appended directly to the preceding text; every
        other kept token is preceded by a single space (so the result normally
        starts with a space). The decision is made from the POS tag itself, so
        a kept token that happens to contain ';' is emitted unchanged.
        """
        pieces = []
        for word, tag in self.okt.pos(sentence, norm=True):
            if word == '' or tag in self._REMOVED_TAGS:
                continue
            if tag in self._ATTACHED_TAGS:
                pieces.append(word)
            else:
                pieces.append(' ' + word)
        return ''.join(pieces)

    def text2sentences(self, text):
        """Split *text* into sentences, merge short ones, and clean each one.

        Steps:
            1. Split with ``self.kkma.sentences(text)``.
            2. Every sentence after the first that has at most 10 tokens
               (as counted by :meth:`CustomTokenizer`) is concatenated, with
               no separator, onto the nearest preceding non-empty sentence.
               If no such sentence exists, it is kept where it is.
            3. Each remaining sentence is rebuilt by :meth:`_clean_sentence`.
            4. Sentences that end up empty are dropped.

        Args:
            text: Raw review text.

        Returns:
            List of cleaned, non-empty sentence strings.
        """
        sentences = list(self.kkma.sentences(text))

        # Index of the nearest preceding non-empty sentence, i.e. the merge
        # target. Tracking it explicitly avoids fixed look-back offsets, which
        # can go negative and wrap around to the end of the list.
        target = 0 if sentences and sentences[0] != '' else None
        for idx in range(1, len(sentences)):
            current = sentences[idx]
            if current == '':
                continue
            is_short = (
                len(self.CustomTokenizer(current))
                <= self._SHORT_SENTENCE_MAX_TOKENS
            )
            if is_short and target is not None:
                sentences[target] += current
                sentences[idx] = ''
            else:
                target = idx

        # Clean the surviving sentences and drop the ones that become empty.
        cleaned = (self._clean_sentence(s) for s in sentences if s != '')
        return [sentence for sentence in cleaned if sentence != '']

    def preprocess(self):
        """Process every review and store the sentences longer than 15 chars.

        Appends the kept sentences to ``self.result_sentences``, prints a
        short summary, and rebuilds ``self.result_sentences_df`` as a
        DataFrame with a single ``'Review'`` column. Works when nothing is
        kept: the summary omits the first sentence and the DataFrame is empty.

        Raises:
            KeyError: If a value of ``review_dict_total`` has no ``'review'``.
        """
        raw_reviews = [item['review'] for item in self.review_dict_total.values()]
        for review in raw_reviews:
            self.result_sentences.extend(
                sentence
                for sentence in self.text2sentences(review)
                if len(sentence) > self._SHORT_SENTENCE_MAX_CHARS
            )

        print('preprocessed sentences :', len(self.result_sentences))
        if self.result_sentences:
            print('first sentence :', self.result_sentences[0])
        # Naming the column at construction keeps this valid for an empty list.
        self.result_sentences_df = pd.DataFrame({'Review': self.result_sentences})