Merge pull request #2 from Patrick-Lapid/tokenization
Allows multiple txt files
Patrick-Lapid authored Sep 17, 2021
2 parents 845b7f2 + 9a05ae8 commit 5e2e409
Showing 2 changed files with 17 additions and 6 deletions.
20 changes: 14 additions & 6 deletions preprocessing/test.py
@@ -2,14 +2,22 @@
  # nltk.download('punkt')
  from nltk.tokenize import sent_tokenize, word_tokenize
  from nltk.util import filestring
+ usrInput = input("Enter file name: ")
+ fileNames = [usrInput]
+
+
+ while usrInput != "":
+     fileNames.append(usrInput)
+     usrInput = input("Enter file name (ENTER to terminate): ")
+
- fileName = input("Enter File Name: ")
  try:
-     with open(fileName, encoding='utf-8') as file:
-         text = file.read()
-         words = word_tokenize(text)
-         sentences = sent_tokenize(text)
-         print('Words: ', words, '\nSentences: ', sentences)
+     for fileName in fileNames:
+         with open(fileName, encoding='utf-8') as file:
+             text = file.read()
+             words = word_tokenize(text)
+             sentences = sent_tokenize(text)
+             print('Words: ', words, '\nSentences: ', sentences)

  except IOError as e:
      print("I/O error({0}): {1}".format(e.errno, e.strerror))
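Taken together, the new lines change the script from tokenizing one file to prompting for file names until the user presses ENTER, then tokenizing each file in turn. Below is a minimal standalone sketch of that flow, assuming nltk is installed and the punkt data has been downloaded; it is not the committed code. It omits the unused nltk.util import, collects each name exactly once (the committed version seeds fileNames with the first input and appends that same value again on the loop's first pass, so the first file is listed twice), and reports a file that fails to open without aborting the remaining files:

# Sketch of the flow this commit introduces, not the committed code.
# Assumes: pip install nltk, then nltk.download('punkt') once.
from nltk.tokenize import sent_tokenize, word_tokenize

fileNames = []
usrInput = input("Enter file name (ENTER to terminate): ")
while usrInput != "":
    fileNames.append(usrInput)
    usrInput = input("Enter file name (ENTER to terminate): ")

for fileName in fileNames:
    try:
        with open(fileName, encoding='utf-8') as file:
            text = file.read()
    except IOError as e:
        # Report the failed file and keep tokenizing the rest.
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
        continue
    print('Words: ', word_tokenize(text), '\nSentences: ', sent_tokenize(text))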
3 changes: 3 additions & 0 deletions preprocessing/test2.txt
@@ -0,0 +1,3 @@
+ apple
+ orange
+ banana
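The new fixture gives the multi-file loop a second file to exercise. Fed to the tokenizers, its three lines should come back as three word tokens, and, since the text has no sentence-final punctuation, punkt should treat it as a single sentence. The snippet below is a sketch of that expectation, not captured program output:

# Expected behavior on the test2.txt contents (assumes punkt is available):
from nltk.tokenize import sent_tokenize, word_tokenize

text = "apple\norange\nbanana\n"
print(word_tokenize(text))   # expected: ['apple', 'orange', 'banana']
print(sent_tokenize(text))   # expected: one item, 'apple\norange\nbanana'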
