forked from zhangmeishan/EGN3LDG
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Alphabet.h
222 lines (194 loc) · 4.26 KB
/
Alphabet.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#ifndef _ALPHABET_
#define _ALPHABET_
#include "MyLib.h"
/*
Please make sure that m_size does not exceed the upper bound of int.
*/
/*
This class maps features from strings to integer IDs.
Index starts from 0.
*/
/**
* The basic class of quark class.
* @param std::string String class name to be used.
* @param int ID class name to be used.
* @author Naoaki Okazaki
*/
class basic_quark {
static const int max_capacity = 10000000;
protected:
typedef unordered_map<std::string, int> StringToId;
typedef std::vector<std::string> IdToString;
StringToId m_string_to_id;
IdToString m_id_to_string;
bool m_b_fixed;
int m_size;
public:
/**
* Construct.
*/
basic_quark()
{
clear();
}
/**
* Destruct.
*/
virtual ~basic_quark()
{
}
/**
* Map a string to its associated ID.
* If string-to-integer association does not exist, allocate a new ID.
* @param str String value.
* @return Associated ID for the string value.
*/
int operator[](const std::string& str)
{
StringToId::const_iterator it = m_string_to_id.find(str);
if (it != m_string_to_id.end()) {
return it->second;
}
else if (!m_b_fixed){
int newid = m_size;
m_id_to_string.push_back(str);
m_string_to_id.insert(std::pair<std::string, int>(str, newid));
m_size++;
if (m_size >= max_capacity)m_b_fixed = true;
return newid;
}
else
{
return -1;
}
}
/**
* Convert ID value into the associated string value.
* @param qid ID.
* @param def Default value if the ID was out of range.
* @return String value associated with the ID.
*/
const std::string& from_id(const int& qid, const std::string& def = "") const
{
if (qid < 0 || m_size <= qid) {
return def;
}
else {
return m_id_to_string[qid];
}
}
/**
* Convert string value into the associated ID value.
* @param str String value.
* @return ID if any, otherwise -1.
*/
int from_string(const std::string& str)
{
StringToId::const_iterator it = m_string_to_id.find(str);
if (it != m_string_to_id.end()) {
return it->second;
}
else if (!m_b_fixed){
int newid = m_size;
m_id_to_string.push_back(str);
m_string_to_id.insert(std::pair<std::string, int>(str, newid));
m_size++;
if (m_size >= max_capacity)m_b_fixed = true;
return newid;
}
else
{
return -1;
}
}
void clear()
{
m_string_to_id.clear();
m_id_to_string.clear();
m_b_fixed = false;
m_size = 0;
}
void set_fixed_flag(bool bfixed)
{
m_b_fixed = bfixed;
if (!m_b_fixed && m_size >= max_capacity){
m_b_fixed = true;
}
}
bool is_fixed() const
{
return m_b_fixed;
}
/**
* Get the number of string-to-id associations.
* @return The number of association.
*/
size_t size() const
{
return m_size;
}
void read(std::ifstream &inf)
{
clear();
static string featKey;
static int featId;
inf >> m_size;
for (int i = 0; i < m_size; ++i) {
inf >> featKey >> featId;
m_string_to_id[featKey] = i;
m_id_to_string.push_back(featKey);
assert(featId == i);
}
if (m_size > 0) {
set_fixed_flag(true);
}
}
void write(std::ofstream &outf) const
{
outf << m_size << std::endl;
for (int i = 0; i<m_size; i++)
{
outf << m_id_to_string[i] << " " << i << std::endl;
}
}
void initial(const unordered_map<string, int>& elem_stat, int cutOff = 0){
clear();
static unordered_map<string, int>::const_iterator elem_iter;
for (elem_iter = elem_stat.begin(); elem_iter != elem_stat.end(); elem_iter++) {
if (elem_iter->second > cutOff) {
from_string(elem_iter->first);
}
}
set_fixed_flag(true);
}
// initial by a file (first column), always an embedding file
void initial(const string& inFile, bool bUseUnknown = true){
clear();
static ifstream inf;
if (inf.is_open()) {
inf.close();
inf.clear();
}
inf.open(inFile.c_str());
static string strLine;
static vector<string> vecInfo;
while (1) {
if (!my_getline(inf, strLine)) {
break;
}
if (!strLine.empty()){
split_bychar(strLine, vecInfo, ' ');
from_string(vecInfo[0]);
}
}
if (bUseUnknown) {
from_string(unknownkey);
}
if (m_size > 0){
set_fixed_flag(true);
}
}
};
// Project-wide names for the string <-> ID alphabet and a pointer to it.
using Alphabet = basic_quark;
using PAlphabet = basic_quark*;
#endif