forked from yoconana/Information-Retrieval
-
Notifications
You must be signed in to change notification settings - Fork 0
/
RemovedString.py
81 lines (65 loc) · 2.27 KB
/
RemovedString.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Author: Yuanwei Wu
# Date: 3/2/2016, 1st version
# Date: 3/5/2016, 2nd version: add the unicode (line24) fixed the EncoderError
# add str() at line54 to fix TypeError: expected a character buffer object
# Description: IR Project: part 1, Document processing and indexing
# Tokenize, remove stop word, stemming
# using NLTK package
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#from nltk.stem.lancaster import LancasterStemmer
# there are many stemmers in nltk, here I use PorterStemmer
from nltk.stem import PorterStemmer
import os
import codecs # otherwise, it has ascii encoding error
# Module-level stemmer instance, shared by RemoveStopwdStem (stateless, safe to reuse).
ps = PorterStemmer()
def RemoveStopwdStem(inputfolder, outputfolder, url):
    """Tokenize a document, drop English stop words, and Porter-stem the rest.

    Reads ``inputfolder/url``, processes its text, and writes the stemmed
    tokens (joined by single spaces) to ``outputfolder/url.removed``.

    Parameters:
        inputfolder: directory containing the raw input document.
        outputfolder: directory that receives the processed output file.
        url: file name of the document; the output is named ``url + ".removed"``.
    """
    # codecs.open with an explicit encoding avoids the ascii-codec
    # UnicodeDecodeError the plain open() call produced (see header notes);
    # errors="replace" keeps a single bad byte from aborting the whole file.
    with codecs.open(inputfolder + "/" + url, "r",
                     encoding="utf-8", errors="replace") as init_file:
        init_word = init_file.read()

    # Tokenize the raw text into words/punctuation.
    tokenized_word = word_tokenize(init_word)

    # Remove stop words; a set gives O(1) membership tests per token.
    stop_words = set(stopwords.words("english"))
    removedstop_word = [w for w in tokenized_word if w not in stop_words]

    # Stem each surviving token. str() coercion mirrors the original code
    # (it fixed a "expected a character buffer object" TypeError); tokens
    # that still trip a Unicode error are reported and skipped rather than
    # crashing the whole document.
    stemmed_words = []
    for w in removedstop_word:
        try:
            stemmed_words.append(str(ps.stem(w)))
        except UnicodeDecodeError:
            print(w)
    outcome = ' '.join(stemmed_words)

    # "with" guarantees the output file is closed even if the write fails.
    with codecs.open(outputfolder + "/" + url + ".removed", "w",
                     encoding="utf-8") as mid_file:
        mid_file.write(outcome)
"""
for filename in os.listdir('cleaned/'):
print(filename)
if ".DS_Store" == filename:
continue;
RemoveStopwdStem("cleaned","Removed",filename)
print("Done.")
"""
# test the 1st file in cleaned/
# filename = 'Acadia_National_Park.htm.cleaned'
# RemoveStopwdStem("cleaned","Removed",filename)