Skip to content
This repository has been archived by the owner on Feb 15, 2023. It is now read-only.

Commit

Permalink
GH-22: update preprocess
Browse files Browse the repository at this point in the history
  • Loading branch information
rain1024 committed Dec 23, 2018
1 parent eae809f commit 60b0d5e
Showing 1 changed file with 36 additions and 34 deletions.
70 changes: 36 additions & 34 deletions util/preprocess_vlsp2013/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,41 @@
from os.path import dirname

# Preprocess Train Data
FOLDER = "/data/projects/undertheseanlp/word_tokenize/data/vlsp2013"
#
# folder1 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-1"
# folder2 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-2"
# count = 0
# output_filepath = "tmp/train.txt"
# if exists(output_filepath):
# remove(output_filepath)
# output = open(output_filepath, "a")
#
# for file in listdir(folder1):
# lines = open(join(folder1, file)).readlines()
# lines = lines[3:-3]
# for line in lines:
# tags = ["<Date>", "</Date>", "<pTitle>", "</pTitle>", "<pHead>", "</pHead>", "<pBody>", "</pBody>", "<pAuthor>", "</pAuthor>",
# "<pInterTitle>", "</pInterTitle>", "<pAnswer>", "</pAnswer>", "<pQuestion>", "</pQuestion>", "<pSuperTitle>",
# "</pSuperTitle>", "<pSubTitle>", "</pSubTitle>"]
# is_break = False
# for tag in tags:
# if line.startswith(tag):
# is_break = True
# continue
# if is_break:
# continue
# output.write(line)
# count += 1
# print("Number of sentences in Trainset-Segmentation-1 folder:", count)
# for file in listdir(folder2):
# for line in open(join(folder2, file)):
# if line.strip():
# output.write(line)
# count += 1
# print("Number of sentences in Trainset-Segmentation-2 folder:", count)
# Root of the VLSP 2013 corpus, relative to the directory the script is run from.
FOLDER = "../../data/vlsp2013"

folder1 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-1"
folder2 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-2"
count = 0
output_filepath = "tmp/train.txt"
# Remove any previous result so the append-mode handle below starts from an
# empty file.
if exists(output_filepath):
    remove(output_filepath)
# NOTE(review): this handle is intentionally left open here; the rest of the
# script is not visible from this section and may still rely on `output`.
# Confirm and close it once the last writer is known.
output = open(output_filepath, "a")

# Markup tags found in the Trainset-Segmentation-1 files. Any line that starts
# with one of them is skipped. Built once (as a tuple, so it can be passed
# straight to str.startswith) instead of once per line.
tags = (
    "<Date>", "</Date>", "<pTitle>", "</pTitle>", "<pHead>", "</pHead>",
    "<pBody>", "</pBody>", "<pAuthor>", "</pAuthor>",
    "<pInterTitle>", "</pInterTitle>", "<pAnswer>", "</pAnswer>",
    "<pQuestion>", "</pQuestion>", "<pSuperTitle>", "</pSuperTitle>",
    "<pSubTitle>", "</pSubTitle>",
)

# Folder 1: copy every line that does not start with a markup tag.
for file in listdir(folder1):
    # NOTE(review): files are read with the platform default encoding, as
    # before; consider passing an explicit encoding once the corpus encoding
    # is confirmed.
    with open(join(folder1, file)) as source:
        lines = source.readlines()
    # The first three and last three lines of each file are dropped
    # (presumably a document header/footer -- confirm against the raw data).
    lines = lines[3:-3]
    for line in lines:
        if line.startswith(tags):
            continue
        output.write(line)
        count += 1
print("Number of sentences in Trainset-Segmentation-1 folder:", count)

# Folder 2: copy every non-blank line. The counter is reset so the number
# printed below covers this folder only.
count = 0
for file in listdir(folder2):
    with open(join(folder2, file)) as source:
        for line in source:
            if line.strip():
                output.write(line)
                count += 1
print("Number of sentences in Trainset-Segmentation-2 folder:", count)

# Preprocess Test Data
count = 0
Expand All @@ -51,6 +53,6 @@
count += 1
content = " ".join(tokens) + "\n"
output.write(content)
print(count)
print("Number of sentences in Testset:", count)


0 comments on commit 60b0d5e

Please sign in to comment.