From 60b0d5e1cbe661089dcf043dca8c52586873a937 Mon Sep 17 00:00:00 2001 From: anhv Date: Sun, 23 Dec 2018 14:33:10 +0700 Subject: [PATCH] GH-22: update preprocess --- util/preprocess_vlsp2013/preprocess.py | 70 +++++++++++++------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/util/preprocess_vlsp2013/preprocess.py b/util/preprocess_vlsp2013/preprocess.py index 1889872..b8463f6 100644 --- a/util/preprocess_vlsp2013/preprocess.py +++ b/util/preprocess_vlsp2013/preprocess.py @@ -3,39 +3,41 @@ from os.path import dirname # Preprocess Train Data -FOLDER = "/data/projects/undertheseanlp/word_tokenize/data/vlsp2013" -# -# folder1 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-1" -# folder2 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-2" -# count = 0 -# output_filepath = "tmp/train.txt" -# if exists(output_filepath): -# remove(output_filepath) -# output = open(output_filepath, "a") -# -# for file in listdir(folder1): -# lines = open(join(folder1, file)).readlines() -# lines = lines[3:-3] -# for line in lines: -# tags = ["", "", "", "", "", "", "", "", "", "", -# "", "", "", "", "", "", "", -# "", "", ""] -# is_break = False -# for tag in tags: -# if line.startswith(tag): -# is_break = True -# continue -# if is_break: -# continue -# output.write(line) -# count += 1 -# print("Number of sentences in Trainset-Segmentation-1 folder:", count) -# for file in listdir(folder2): -# for line in open(join(folder2, file)): -# if line.strip(): -# output.write(line) -# count += 1 -# print("Number of sentences in Trainset-Segmentation-2 folder:", count) +FOLDER = "../../data/vlsp2013" + +folder1 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-1" +folder2 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-2" +count = 0 +output_filepath = "tmp/train.txt" +if exists(output_filepath): + remove(output_filepath) +output = open(output_filepath, "a") + +for file in listdir(folder1): + lines = open(join(folder1, file)).readlines() + lines = lines[3:-3] + for line in lines: + tags = ["", "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", + "", "", ""] + is_break = False + for tag in tags: + if line.startswith(tag): + is_break = True + continue + if is_break: + continue + output.write(line) + count += 1 +print("Number of sentences in Trainset-Segmentation-1 folder:", count) + +count = 0 +for file in listdir(folder2): + for line in open(join(folder2, file)): + if line.strip(): + output.write(line) + count += 1 +print("Number of sentences in Trainset-Segmentation-2 folder:", count) # Preprocess Test Data count = 0 @@ -51,6 +53,6 @@ count += 1 content = " ".join(tokens) + "\n" output.write(content) -print(count) +print("Number of sentences in Testset:", count)