GH-22: update preprocess

undertheseanlp · Dec 23, 2018 · 60b0d5e
1 parent eae809f
commit 60b0d5e
Showing 1 changed file with 36 additions and 34 deletions.
diff --git a/util/preprocess_vlsp2013/preprocess.py b/util/preprocess_vlsp2013/preprocess.py
@@ -3,39 +3,41 @@
 from os.path import dirname
 
 # Preprocess Train Data
-FOLDER = "/data/projects/undertheseanlp/word_tokenize/data/vlsp2013"
-#
-# folder1 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-1"
-# folder2 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-2"
-# count = 0
-# output_filepath = "tmp/train.txt"
-# if exists(output_filepath):
-#     remove(output_filepath)
-# output = open(output_filepath, "a")
-#
-# for file in listdir(folder1):
-#     lines = open(join(folder1, file)).readlines()
-#     lines = lines[3:-3]
-#     for line in lines:
-#         tags = ["<Date>", "</Date>", "<pTitle>", "</pTitle>", "<pHead>", "</pHead>", "<pBody>", "</pBody>", "<pAuthor>", "</pAuthor>",
-#                 "<pInterTitle>", "</pInterTitle>", "<pAnswer>", "</pAnswer>", "<pQuestion>", "</pQuestion>", "<pSuperTitle>",
-#                 "</pSuperTitle>", "<pSubTitle>", "</pSubTitle>"]
-#         is_break = False
-#         for tag in tags:
-#             if line.startswith(tag):
-#                 is_break = True
-#                 continue
-#         if is_break:
-#             continue
-#         output.write(line)
-#         count += 1
-# print("Number of sentences in Trainset-Segmentation-1 folder:", count)
-# for file in listdir(folder2):
-#     for line in open(join(folder2, file)):
-#         if line.strip():
-#             output.write(line)
-#             count += 1
-# print("Number of sentences in Trainset-Segmentation-2 folder:", count)
+FOLDER = "../../data/vlsp2013"
+
+folder1 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-1"
+folder2 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-2"
+count = 0
+output_filepath = "tmp/train.txt"
+if exists(output_filepath):
+    remove(output_filepath)
+output = open(output_filepath, "a")
+
+for file in listdir(folder1):
+    lines = open(join(folder1, file)).readlines()
+    lines = lines[3:-3]
+    for line in lines:
+        tags = ["<Date>", "</Date>", "<pTitle>", "</pTitle>", "<pHead>", "</pHead>", "<pBody>", "</pBody>", "<pAuthor>", "</pAuthor>",
+                "<pInterTitle>", "</pInterTitle>", "<pAnswer>", "</pAnswer>", "<pQuestion>", "</pQuestion>", "<pSuperTitle>",
+                "</pSuperTitle>", "<pSubTitle>", "</pSubTitle>"]
+        is_break = False
+        for tag in tags:
+            if line.startswith(tag):
+                is_break = True
+                continue
+        if is_break:
+            continue
+        output.write(line)
+        count += 1
+print("Number of sentences in Trainset-Segmentation-1 folder:", count)
+
+count = 0
+for file in listdir(folder2):
+    for line in open(join(folder2, file)):
+        if line.strip():
+            output.write(line)
+            count += 1
+print("Number of sentences in Trainset-Segmentation-2 folder:", count)
 
 # Preprocess Test Data
 count = 0
@@ -51,6 +53,6 @@
         count += 1
         content = " ".join(tokens) + "\n"
         output.write(content)
-print(count)
+print("Number of sentences in Testset:", count)