Merge pull request #2 from Patrick-Lapid/tokenization
Allows multiple txt files
Patrick-Lapid authored Sep 17, 2021
2 parents 845b7f2 + 9a05ae8 commit 5e2e409
Showing 2 changed files with 17 additions and 6 deletions.
20 changes: 14 additions & 6 deletions preprocessing/test.py
@@ -2,14 +2,22 @@
  # nltk.download('punkt')
  from nltk.tokenize import sent_tokenize, word_tokenize
  from nltk.util import filestring
+ usrInput = input("Enter file name: ")
+ fileNames = [usrInput]
+
+
+ while usrInput != "":
+     fileNames.append(usrInput)
+     usrInput = input("Enter file name (ENTER to terminate): ")
+
- fileName = input("Enter File Name: ")
  try:
-     with open(fileName, encoding='utf-8') as file:
-         text = file.read()
-         words = word_tokenize(text)
-         sentences = sent_tokenize(text)
-         print('Words: ', words, '\nSentences: ', sentences)
+     for fileName in fileNames:
+         with open(fileName, encoding='utf-8') as file:
+             text = file.read()
+             words = word_tokenize(text)
+             sentences = sent_tokenize(text)
+             print('Words: ', words, '\nSentences: ', sentences)

  except IOError as e:
      print("I/O error({0}): {1}".format(e.errno, e.strerror))
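Taken together, the new lines change the script from tokenizing one file to prompting for file names until the user presses ENTER, then tokenizing each file in turn. Below is a minimal standalone sketch of that flow, assuming nltk is installed and the punkt data has been downloaded; it is not the committed code. It omits the unused nltk.util import, collects each name exactly once (the committed version seeds fileNames with the first input and appends that same value again on the loop's first pass, so the first file is listed twice), and reports a file that fails to open without aborting the remaining files:

# Sketch of the flow this commit introduces, not the committed code.
# Assumes: pip install nltk, then nltk.download('punkt') once.
from nltk.tokenize import sent_tokenize, word_tokenize

fileNames = []
usrInput = input("Enter file name (ENTER to terminate): ")
while usrInput != "":
    fileNames.append(usrInput)
    usrInput = input("Enter file name (ENTER to terminate): ")

for fileName in fileNames:
    try:
        with open(fileName, encoding='utf-8') as file:
            text = file.read()
    except IOError as e:
        # Report the failed file and keep tokenizing the rest.
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
        continue
    print('Words: ', word_tokenize(text), '\nSentences: ', sent_tokenize(text))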
3 changes: 3 additions & 0 deletions preprocessing/test2.txt
@@ -0,0 +1,3 @@
+ apple
+ orange
+ banana
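The new fixture gives the multi-file loop a second file to exercise. Fed to the tokenizers, its three lines should come back as three word tokens, and, since the text has no sentence-final punctuation, punkt should treat it as a single sentence. The snippet below is a sketch of that expectation, not captured program output:

# Expected behavior on the test2.txt contents (assumes punkt is available):
from nltk.tokenize import sent_tokenize, word_tokenize

text = "apple\norange\nbanana\n"
print(word_tokenize(text))   # expected: ['apple', 'orange', 'banana']
print(sent_tokenize(text))   # expected: one item, 'apple\norange\nbanana'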
