Skip to content
This repository has been archived by the owner on Feb 15, 2023. It is now read-only.

Commit

Permalink
GH-22: add preprocess
Browse files Browse the repository at this point in the history
  • Loading branch information
rain1024 committed Dec 24, 2018
1 parent 32ea052 commit 9fa9611
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 33 deletions.
3 changes: 3 additions & 0 deletions egs/vlsp2013_crf/crf_sequence_tagger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
class CRFSequenceTagger:
    """A CRF-based sequence tagger configured by a feature template.

    Parameters
    ----------
    features : list of str, optional
        Feature-template strings (e.g. ``"T[0].lower"``); ``None`` when the
        tagger is constructed without a template.
    """

    def __init__(self, features=None):
        self.features = features
39 changes: 39 additions & 0 deletions egs/vlsp2013_crf/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from os.path import join

class WordTokenizeCorpusReader:
    """Reads word-tokenization corpora in which multi-syllable words are
    joined with underscores, producing B-W/I-W tagged token sequences.
    """

    @staticmethod
    def read(data_folder, train_file=None, test_file=None):
        """Read train and test files from *data_folder*.

        Returns a TaggedCorpus whose ``train``/``test`` attributes are lists
        of sentences, each sentence a list of ``(token, tag)`` pairs with
        tags in {"B-W", "I-W"}.
        """
        train = WordTokenizeCorpusReader.__read_data(join(data_folder, train_file))
        test = WordTokenizeCorpusReader.__read_data(join(data_folder, test_file))
        tagged_corpus = TaggedCorpus(train, test)
        return tagged_corpus

    @staticmethod
    def __read_data(data_file):
        """Read *data_file* and return one tagged sentence per line."""
        # Use a context manager so the handle is closed deterministically
        # (the previous `open(...).read()` leaked it), and decode as UTF-8
        # explicitly -- this is Vietnamese corpus text, so relying on the
        # platform default encoding would break on non-UTF-8 locales.
        with open(data_file, encoding="utf-8") as f:
            text = f.read()
        sentences = text.split("\n")
        sentences = [WordTokenizeCorpusReader.__extract_tokens(s) for s in sentences]
        return sentences

    @staticmethod
    def __extract_tokens(s):
        """Convert one raw sentence line into a list of (token, tag) pairs.

        Words are whitespace-separated; syllables within a word are joined
        by "_". The first syllable of a word is tagged "B-W" (begin word),
        the remaining syllables "I-W" (inside word).
        """
        sentence = []
        for item in s.split():
            tokens = item.split("_")
            for i, token in enumerate(tokens):
                if i == 0:
                    sentence.append((token, "B-W"))
                else:
                    sentence.append((token, "I-W"))
        return sentence

class TaggedCorpus:
    """Container for a train/test split of tagged sentences."""

    def __init__(self, train, test):
        self.train = train
        self.test = test

    def downsample(self, percentage):
        """Keep only the leading *percentage* fraction of each split.

        Mutates ``train`` and ``test`` in place; the kept count is truncated
        with ``int()``, matching ``int(len(split) * percentage)``.
        """
        keep_train = int(len(self.train) * percentage)
        keep_test = int(len(self.test) * percentage)
        self.train = self.train[:keep_train]
        self.test = self.test[:keep_test]
62 changes: 30 additions & 32 deletions egs/vlsp2013_crf/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,42 +3,15 @@
from os.path import dirname
from languageflow.model.crf import CRF
from languageflow.transformer.tagged import TaggedTransformer
from egs.vlsp2013_crf.conlleval import evaluate
from egs.vlsp2013_crf.word_tokenize import CRFModel
from .load_data import load_dataset
from crf_sequence_tagger import CRFSequenceTagger
from data import WordTokenizeCorpusReader
import pycrfsuite

class TaggedCorpusFetcher:
    # NOTE(review): unused stub -- no fetching logic implemented; appears to
    # be superseded by WordTokenizeCorpusReader in data.py.
    @staticmethod
    def fetch(data_folder, train_file=None, test_file=None, dev_file=None):
        # Placeholder: arguments are accepted but ignored.
        pass

class TaggedCorpus:
    # NOTE(review): stub -- constructor arguments are discarded; presumably a
    # placeholder for a train/dev/test container. Confirm before relying on it.
    def __init__(self, train, dev, test):
        pass

# Feature template for the CRF tagger. Each entry names a feature over a
# token window: T[i] is the token at relative offset i; T[i,j] presumably
# denotes the joined span from offset i to j (confirm against the
# TaggedTransformer implementation). Suffixes (.lower, .isdigit, .istitle,
# .is_in_dict) select token-level predicates.
template = [
    "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower",

    "T[-1].isdigit", "T[0].isdigit", "T[1].isdigit",

    "T[-1].istitle", "T[0].istitle", "T[1].istitle",
    "T[0,1].istitle", "T[0,2].istitle",

    "T[-2].is_in_dict", "T[-1].is_in_dict", "T[0].is_in_dict", "T[1].is_in_dict", "T[2].is_in_dict",
    "T[-2,-1].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict", "T[1,2].is_in_dict",
    "T[-2,0].is_in_dict", "T[-1,1].is_in_dict", "T[0,2].is_in_dict",

    # word unigram and bigram and trigram
    "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
    "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
    "T[-2,0]", "T[-1,1]", "T[0,2]",
]
from trainer import Trainer


def train(train_path, model_path):
train_set = []

train_set += load_dataset(train_path)
print("Load data from file", train_path)
transformer = TaggedTransformer(template)
X, y = transformer.transform(train_set, contain_labels=True)
Expand Down Expand Up @@ -75,7 +48,8 @@ def train_test(train_path, test_path):
train(train_path, model_path)
estimator = CRFModel.instance(model_path)

test = load_dataset(test_path)
# test = load_dataset(test_path)
test = None
for sample in test:
sentence = [token[0] for token in sample]
y_test = [token[1] for token in sample]
Expand All @@ -97,3 +71,27 @@ class Args(object):

os.remove(model_path)
os.remove(output_path)

# --- Module-level training script --------------------------------------------
# Load the word-tokenization corpus from data/train.txt and data/test.txt.
corpus = WordTokenizeCorpusReader.read("data", train_file="train.txt", test_file="test.txt")

# Feature template for the CRF tagger: T[i] is the token at relative offset
# i; T[i,j] presumably the joined span from offset i to j (confirm against
# TaggedTransformer). Suffixes select token-level predicates.
features = [
    "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower",

    "T[-1].isdigit", "T[0].isdigit", "T[1].isdigit",

    "T[-1].istitle", "T[0].istitle", "T[1].istitle",
    "T[0,1].istitle", "T[0,2].istitle",

    "T[-2].is_in_dict", "T[-1].is_in_dict", "T[0].is_in_dict", "T[1].is_in_dict", "T[2].is_in_dict",
    "T[-2,-1].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict", "T[1,2].is_in_dict",
    "T[-2,0].is_in_dict", "T[-1,1].is_in_dict", "T[0,2].is_in_dict",

    # word unigram and bigram and trigram
    "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
    "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
    "T[-2,0]", "T[-1,1]", "T[0,2]",
]
tagger = CRFSequenceTagger(features)
trainer = Trainer(tagger, corpus)
# NOTE(review): `feature=None` is passed but Trainer.train never uses it; c1/c2
# are presumably CRF L1/L2 regularization weights -- confirm once the fitting
# step is implemented.
trainer.train(c1=0.1, c2=0.01, feature=None)
# NOTE(review): leftover debug output -- consider removing.
print(0)
12 changes: 12 additions & 0 deletions egs/vlsp2013_crf/trainer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from languageflow.transformer.tagged import TaggedTransformer


class Trainer:
    """Drives training of a sequence tagger over a tagged corpus."""

    def __init__(self, tagger, corpus):
        self.tagger = tagger
        self.corpus = corpus

    def train(self, c1, c2, feature):
        """Vectorize the corpus's training split with the tagger's features.

        NOTE: `c1`, `c2` and `feature` are currently unused -- the actual CRF
        fitting step is not implemented yet; this only runs the transform.
        """
        X_train, y_train = TaggedTransformer(self.tagger.features).transform(
            self.corpus.train, contain_labels=True
        )
2 changes: 1 addition & 1 deletion util/preprocess_vlsp2013/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# Source folders of the raw VLSP word-segmentation training data.
folder1 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-1"
folder2 = FOLDER + "/raw/WordSegmentationTask/Trainset/Trainset-Segmentation-2"
count = 0
# NOTE(review): the next two assignments are the old/new pair from a scraped
# diff (output renamed from train_dev.txt to train.txt); only the second
# assignment takes effect.
output_filepath = "tmp/train_dev.txt"
output_filepath = "tmp/train.txt"

# Start from a clean output file.
if exists(output_filepath):
    remove(output_filepath)
Expand Down

0 comments on commit 9fa9611

Please sign in to comment.