
Commit

GH-22: update train
rain1024 committed Dec 26, 2018
1 parent aae7cfe commit 767cb64
Showing 3 changed files with 12 additions and 81 deletions.
1 change: 1 addition & 0 deletions egs/vlsp2013_crf/data.py
@@ -20,6 +20,7 @@ def __extract_tokens(s):
     sentence = []
     for item in s.split():
         tokens = item.split("_")
+        tokens = [token for token in tokens if token]
         for i, token in enumerate(tokens):
             if i == 0:
                 sentence.append((token, "B-W"))
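For context, the added list comprehension drops the empty strings that item.split("_") produces when a token carries a leading, trailing, or doubled underscore. A minimal standalone sketch of this hunk's logic; the extract_tokens name and the "I-W" else-branch are assumptions based on the surrounding code, not a copy of the original __extract_tokens:

def extract_tokens(s):
    """Turn a word-segmented sentence into (syllable, BIO-tag) pairs."""
    sentence = []
    for item in s.split():
        tokens = item.split("_")
        # the added line: filter out empty strings caused by stray underscores
        tokens = [token for token in tokens if token]
        for i, token in enumerate(tokens):
            sentence.append((token, "B-W" if i == 0 else "I-W"))
    return sentence

print(extract_tokens("Hà_Nội_ là thủ_đô"))
# [('Hà', 'B-W'), ('Nội', 'I-W'), ('là', 'B-W'), ('thủ', 'B-W'), ('đô', 'I-W')]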
88 changes: 9 additions & 79 deletions egs/vlsp2013_crf/train.py
@@ -1,91 +1,22 @@
import os
from os import makedirs
from os.path import dirname
from languageflow.model.crf import CRF
from languageflow.transformer.tagged import TaggedTransformer
from crf_sequence_tagger import CRFSequenceTagger
from data import WordTokenizeCorpusReader
import pycrfsuite

from trainer import Trainer


def train(train_path, model_path):
    train_set = []
    print("Load data from file", train_path)
    transformer = TaggedTransformer(template)
    X, y = transformer.transform(train_set, contain_labels=True)

    # train
    params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,  #
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    folder = dirname(model_path)
    try:
        makedirs(folder)
    except:
        pass
    estimator = CRF(params=params, filename=model_path)
    estimator.fit(X, y)


def _remove_file(output_path):
    try:
        os.remove(output_path)
    except:
        pass


def train_test(train_path, test_path):
    model_path = "model.tmp.bin"
    output_path = "output.txt"
    _remove_file(output_path)
    output = open(output_path, "a")
    train(train_path, model_path)
    estimator = CRFModel.instance(model_path)

    # test = load_dataset(test_path)
    test = None
    for sample in test:
        sentence = [token[0] for token in sample]
        y_test = [token[1] for token in sample]
        y_pred = estimator.predict(sentence)
        for i in range(len(y_test)):
            line = "{}\t{}\t{}\n".format(y_pred[i][0], y_test[i], y_pred[i][1])
            output.write(line)
        output.write("\n")

    class Args(object):
        pass

    args = Args()
    args.latex = False
    args.raw = False
    args.delimiter = None
    args.oTag = "O"
    evaluate(open(output_path), args)

    os.remove(model_path)
    os.remove(output_path)


corpus = WordTokenizeCorpusReader.read("data", train_file="train.txt", test_file="test.txt").downsample(0.1)
corpus = WordTokenizeCorpusReader.read("data", train_file="train.txt", test_file="test.txt")

features = [
    "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower",

    "T[-1].isdigit", "T[0].isdigit", "T[1].isdigit",

    "T[-1].istitle", "T[0].istitle", "T[1].istitle",
    "T[0,1].istitle", "T[0,2].istitle",

    "T[-2].is_in_dict", "T[-1].is_in_dict", "T[0].is_in_dict", "T[1].is_in_dict", "T[2].is_in_dict",
    "T[-2,-1].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict", "T[1,2].is_in_dict",
    "T[-2,0].is_in_dict", "T[-1,1].is_in_dict", "T[0,2].is_in_dict",
    # "T[-1].isdigit", "T[0].isdigit", "T[1].isdigit",
    #
    # "T[-1].istitle", "T[0].istitle", "T[1].istitle",
    # "T[0,1].istitle", "T[0,2].istitle",

    # "T[-2].is_in_dict", "T[-1].is_in_dict", "T[0].is_in_dict", "T[1].is_in_dict", "T[2].is_in_dict",
    # "T[-2,-1].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict", "T[1,2].is_in_dict",
    # "T[-2,0].is_in_dict", "T[-1,1].is_in_dict", "T[0,2].is_in_dict",
    "T[-2,-1].lower", "T[-1,0].lower", "T[0,1].lower", "T[1,2].lower",
    # word unigram and bigram and trigram
    "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
    "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
@@ -94,4 +25,3 @@ class Args(object):
tagger = CRFSequenceTagger(features)
trainer = Trainer(tagger, corpus)
trainer.train(c1=0.1, c2=0.01, feature=None)
print(0)
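The feature strings above follow this project's template convention. As a working assumption, T[i] refers to the token i positions from the current one, T[i,j] to the joined span from i to j, and suffixes such as .lower, .istitle, .isdigit and .is_in_dict to transformations of that text. A rough sketch of how a few of these templates could be expanded into per-token pycrfsuite features; this is an illustration, not the actual CRFSequenceTagger/TaggedTransformer implementation:

def word2features(sent, i):
    # sent is a list of syllables; feature names mirror a few templates from the list above
    token = sent[i]
    features = [
        "T[0]=" + token,
        "T[0].lower=" + token.lower(),
        "T[0].istitle=%s" % token.istitle(),
        "T[0].isdigit=%s" % token.isdigit(),
    ]
    if i > 0:
        features.append("T[-1].lower=" + sent[i - 1].lower())
        features.append("T[-1,0]=" + sent[i - 1] + " " + token)  # joined bigram
    else:
        features.append("BOS")  # sentence-boundary marker (an assumption, not from the template)
    if i < len(sent) - 1:
        features.append("T[1].lower=" + sent[i + 1].lower())
    else:
        features.append("EOS")
    return features

sentences = [["Hà", "Nội", "là", "thủ", "đô"]]
X = [[word2features(s, i) for i in range(len(s))] for s in sentences]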
4 changes: 2 additions & 2 deletions egs/vlsp2013_crf/trainer.py
@@ -29,7 +29,7 @@ def train(self, c1, c2, feature):
         params = {
             'c1': 1.0,  # coefficient for L1 penalty
             'c2': 1e-3,  # coefficient for L2 penalty
-            'max_iterations': 200,  #
+            'max_iterations': 1000,  #
             # include transitions that are possible, but not observed
             'feature.possible_transitions': True
         }
@@ -45,7 +45,7 @@ def train(self, c1, c2, feature):
         logger.info("Start tagger")
         tagger = pycrfsuite.Tagger()
         tagger.open(filename)
-        y_pred = [tagger.tag(xseq) for x_seq in X_test]
+        y_pred = [tagger.tag(x_seq) for x_seq in X_test]
         sentences = [[item[0] for item in sentence] for sentence in self.corpus.test]
         sentences = zip(sentences, y_test, y_pred)
         texts = []
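For reference, the pycrfsuite calls that this Trainer wraps follow the usual train-then-tag cycle. A minimal self-contained sketch with toy data; the model file name and the feature strings are placeholders, while the parameter values are the ones visible in the diff above:

import pycrfsuite

# toy data: one sentence of syllable features with BIO word-boundary labels
X_train = [[["T[0].lower=hà"], ["T[0].lower=nội"], ["T[0].lower=đẹp"]]]
y_train = [["B-W", "I-W", "B-W"]]

trainer = pycrfsuite.Trainer(verbose=False)
for x_seq, y_seq in zip(X_train, y_train):
    trainer.append(x_seq, y_seq)

trainer.set_params({
    "c1": 1.0,                             # coefficient for L1 penalty
    "c2": 1e-3,                            # coefficient for L2 penalty
    "max_iterations": 1000,                # value used after this commit
    "feature.possible_transitions": True,  # include possible but unobserved transitions
})
trainer.train("model.tmp.bin")             # writes the model file to disk

tagger = pycrfsuite.Tagger()
tagger.open("model.tmp.bin")
y_pred = [tagger.tag(x_seq) for x_seq in X_train]
print(y_pred)                              # e.g. [['B-W', 'I-W', 'B-W']]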
