-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTokenizer.js
122 lines (122 loc) · 4.25 KB
/
Tokenizer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import ViterbiBuilder from "./viterbi/ViterbiBuilder.js";
import ViterbiSearcher from "./viterbi/ViterbiSearcher.js";
import IpadicFormatter from "./util/IpadicFormatter.js";
/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"use strict";
// Matches either sentence delimiter recognised by the tokenizer:
// the ideographic comma (、) or the ideographic full stop (。).
const PUNCTUATION = /、|。/;
/**
 * Tokenizer
 *
 * Splits text into sentences, builds a word lattice for each sentence,
 * picks the best path through it and formats every node on that path
 * as a token.
 */
class Tokenizer {
    /**
     * @param {DynamicDictionaries} dic Dictionaries used by this tokenizer
     */
    constructor(dic) {
        this.token_info_dictionary = dic.token_info_dictionary;
        this.unknown_dictionary = dic.unknown_dictionary;
        this.viterbi_builder = new ViterbiBuilder(dic);
        this.viterbi_searcher = new ViterbiSearcher(dic.connection_costs);
        // TODO Other dictionaries
        this.formatter = new IpadicFormatter();
    }

    /**
     * Cut the input after every punctuation mark.
     *
     * Each returned piece ends with the punctuation character that closed
     * it, except possibly the last one, which holds whatever text follows
     * the final punctuation mark. An empty input yields an empty array.
     *
     * @param {string} input Input text
     * @returns {Array.<string>} Sentences end with punctuation
     */
    static splitByPunctuation(input) {
        const pieces = [];
        let rest = input;
        while (rest !== "") {
            const hit = rest.search(PUNCTUATION);
            if (hit < 0) {
                // No delimiter left: the remainder is the final piece.
                pieces.push(rest);
                return pieces;
            }
            // Keep the delimiter attached to the piece it terminates.
            const cut = hit + 1;
            pieces.push(rest.substring(0, cut));
            rest = rest.substring(cut);
        }
        return pieces;
    }

    /**
     * Tokenize text
     *
     * The text is split with {@link Tokenizer.splitByPunctuation} and the
     * tokens of every sentence are collected into one array.
     *
     * @param {string} text Input text to analyze
     * @returns {Array} Tokens
     */
    tokenize(text) {
        const collected = [];
        for (const sentence of Tokenizer.splitByPunctuation(text)) {
            this.tokenizeForSentence(sentence, collected);
        }
        return collected;
    }

    /**
     * Tokenize a single sentence and append its tokens to an array.
     *
     * Token positions are shifted by the word_position of the last token
     * already present in the array (0 when the array is empty).
     * NOTE(review): that shift is the previous token's start position, not
     * its end; presumably this relies on the previous token being a single
     * character wide and on node.start_pos being 1-based - confirm.
     *
     * @param {string} sentence Sentence to analyze
     * @param {Array} [tokens] Array to append to; a new one is created when null or undefined
     * @returns {Array} The array the tokens were appended to
     */
    tokenizeForSentence(sentence, tokens) {
        const out = tokens == null ? [] : tokens;
        const lattice = this.getLattice(sentence);
        const path = this.viterbi_searcher.search(lattice);
        const offset = out.length > 0 ? out[out.length - 1].word_position : 0;

        // A missing feature line means "no features"; otherwise it is comma-separated.
        const toFeatures = (line) => (line == null ? [] : line.split(","));

        for (let k = 0; k < path.length; k++) {
            const node = path[k];
            let entry;
            switch (node.type) {
                case "KNOWN": {
                    const features = toFeatures(this.token_info_dictionary.getFeatures(node.name));
                    entry = this.formatter.formatEntry(node.name, offset + node.start_pos, node.type, features);
                    break;
                }
                case "UNKNOWN": {
                    // Unknown word
                    const features = toFeatures(this.unknown_dictionary.getFeatures(node.name));
                    entry = this.formatter.formatUnknownEntry(node.name, offset + node.start_pos, node.type, features, node.surface_form);
                    break;
                }
                default:
                    // TODO User dictionary
                    entry = this.formatter.formatEntry(node.name, offset + node.start_pos, node.type, []);
                    break;
            }
            out.push(entry);
        }
        return out;
    }

    /**
     * Build word lattice
     * @param {string} text Input text to analyze
     * @returns {ViterbiLattice} Word lattice
     */
    getLattice(text) {
        return this.viterbi_builder.build(text);
    }
}
export default Tokenizer;