forked from FooSoft/jmdict
-
Notifications
You must be signed in to change notification settings - Fork 1
/
jmdict.go
260 lines (218 loc) · 10.7 KB
/
jmdict.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
package jmdict
import "io"
// Jmdict holds a decoded dictionary: one JmdictEntry for every <entry>
// element found in the XML document.
type Jmdict struct {
// Entries consist of kanji elements, reading elements,
// general information and sense elements. Each entry must have at
// least one reading element and one sense element. Others are optional.
Entries []JmdictEntry `xml:"entry"`
}
// JmdictEntry is a single dictionary entry, decoded from one <entry>
// element. It groups the entry's sequence number with its kanji elements,
// reading elements and senses.
type JmdictEntry struct {
// A unique numeric sequence number for each entry
Sequence int `xml:"ent_seq"`
// The kanji element, or in its absence, the reading element, is
// the defining component of each entry.
// The overwhelming majority of entries will have a single kanji
// element associated with a word in Japanese. Where there are
// multiple kanji elements within an entry, they will be orthographical
// variants of the same word, either using variations in okurigana, or
// alternative and equivalent kanji. Common "mis-spellings" may be
// included, provided they are associated with appropriate information
// fields. Synonyms are not included; they may be indicated in the
// cross-reference field associated with the sense element.
Kanji []JmdictKanji `xml:"k_ele"`
// The reading element typically contains the valid readings
// of the word(s) in the kanji element using modern kanadzukai.
// Where there are multiple reading elements, they will typically be
// alternative readings of the kanji element. In the absence of a
// kanji element, i.e. in the case of a word or phrase written
// entirely in kana, these elements will define the entry.
Readings []JmdictReading `xml:"r_ele"`
// The sense element will record the translational equivalent
// of the Japanese word, plus other related information. Where there
// are several distinctly different meanings of the word, multiple
// sense elements will be employed.
Sense []JmdictSense `xml:"sense"`
}
// JmdictKanji is one kanji element (<k_ele>) of an entry: the written
// expression plus its orthography information codes and priority codes.
type JmdictKanji struct {
// This element will contain a word or short phrase in Japanese
// which is written using at least one non-kana character (usually kanji,
// but can be other characters). The valid characters are
// kanji, kana, related characters such as chouon and kurikaeshi, and
// in exceptional cases, letters from other alphabets.
Expression string `xml:"keb"`
// This is a coded information field related specifically to the
// orthography of the keb, and will typically indicate some unusual
// aspect, such as okurigana irregularity.
Information []string `xml:"ke_inf"`
// This and the equivalent re_pri field are provided to record
// information about the relative priority of the entry, and consist
// of codes indicating the word appears in various references which
// can be taken as an indication of the frequency with which the word
// is used. This field is intended for use either by applications which
// want to concentrate on entries of a particular priority, or to
// generate subset files.
// The current values in this field are:
// - news1/2: appears in the "wordfreq" file compiled by Alexandre Girardi
// from the Mainichi Shimbun. (See the Monash ftp archive for a copy.)
// Words in the first 12,000 in that file are marked "news1" and words
// in the second 12,000 are marked "news2".
// - ichi1/2: appears in the "Ichimango goi bunruishuu", Senmon Kyouiku
// Publishing, Tokyo, 1998. (The entries marked "ichi2" were
// demoted from ichi1 because they were observed to have low
// frequencies in the WWW and newspapers.)
// - spec1 and spec2: a small number of words use this marker when they
// are detected as being common, but are not included in other lists.
// - gai1/2: common loanwords, based on the wordfreq file.
// - nfxx: this is an indicator of frequency-of-use ranking in the
// wordfreq file. "xx" is the number of the set of 500 words in which
// the entry can be found, with "01" assigned to the first 500, "02"
// to the second, and so on. (The entries with news1, ichi1, spec1 and
// gai1 values are marked with a "(P)" in the EDICT and EDICT2
// files.)
// The reason both the kanji and reading elements are tagged is because
// on occasions a priority is only associated with a particular
// kanji/reading pair.
Priorities []string `xml:"ke_pri"`
}
// JmdictReading is one reading element (<r_ele>) of an entry: the kana
// reading plus its restrictions, information codes and priority codes.
type JmdictReading struct {
// This element content is restricted to kana and related
// characters such as chouon and kurikaeshi. Kana usage will be
// consistent between the keb and reb elements; e.g. if the keb
// contains katakana, so too will the reb.
Reading string `xml:"reb"`
// This element, which will usually have a null value, indicates
// that the reb, while associated with the keb, cannot be regarded
// as a true reading of the kanji. It is typically used for words
// such as foreign place names, gairaigo which can be in kanji or
// katakana, etc.
// The field is a pointer so that a present-but-empty element (non-nil)
// can be told apart from a missing one (nil).
NoKanji *string `xml:"re_nokanji"`
// This element is used to indicate when the reading only applies
// to a subset of the keb elements in the entry. In its absence, all
// readings apply to all kanji elements. The contents of this element
// must exactly match those of one of the keb elements.
Restrictions []string `xml:"re_restr"`
// General coded information pertaining to the specific reading.
// Typically it will be used to indicate some unusual aspect of
// the reading.
Information []string `xml:"re_inf"`
// Priority codes for this reading; they use the same code set that is
// documented on the ke_pri field of JmdictKanji.
Priorities []string `xml:"re_pri"`
}
// JmdictSource is one <lsource> element of a sense: the source word or
// phrase of a loanword together with the attributes describing it.
type JmdictSource struct {
// Content is the character data of the element, i.e. the source word or
// phrase (may be empty).
Content string `xml:",chardata"`
// The xml:lang attribute defines the language(s) from which
// a loanword is drawn. It will be coded using the three-letter language
// code from the ISO 639-2 standard. When absent, the value "eng" (i.e.
// English) is the default value. The bibliographic (B) codes are used.
// NOTE(review): the struct tag supplies no default, so this pointer is
// presumably nil when the attribute is absent - confirm against parseDict.
Language *string `xml:"lang,attr"`
// The ls_type attribute indicates whether the lsource element
// fully or partially describes the source word or phrase of the
// loanword. If absent, it will have the implied value of "full".
// Otherwise it will contain "part".
Type *string `xml:"ls_type,attr"`
// The ls_wasei attribute indicates that the Japanese word
// has been constructed from words in the source language, and
// not from an actual phrase in that language. Most commonly used to
// indicate "waseieigo".
// Unlike the other attributes this is a plain string rather than a
// pointer.
Wasei string `xml:"ls_wasei,attr"`
}
// JmdictGlossary is one <gloss> element of a sense: a target-language word
// or phrase together with its optional attributes. The optional attributes
// are pointers; nil means the attribute was not present in the XML.
type JmdictGlossary struct {
	// Content is the character data of the element, i.e. the gloss text.
	Content string `xml:",chardata"`

	// The xml:lang attribute defines the target language of the
	// gloss. It will be coded using the three-letter language code from
	// the ISO 639 standard. When absent, the value "eng" (i.e. English)
	// is the default value.
	Language *string `xml:"lang,attr"`

	// The g_gend attribute defines the gender of the gloss (typically
	// a noun in the target language). When absent, the gender is either
	// not relevant or has yet to be provided.
	//
	// The ",attr" option is required here: without it encoding/xml looks
	// for a child element named g_gend and the attribute is never decoded.
	Gender *string `xml:"g_gend,attr"`

	// g_type attribute added in jmdict Rev 1.09
	// At present the values used are "lit", "fig", "expl" and "tm". It is
	// proposed to add a "descr" value to indicate a gloss which is a
	// description of the Japanese term rather than a translation or an
	// explanation of the meaning.
	Type *string `xml:"g_type,attr"`
}
// JmdictSense is one <sense> element of an entry: the glosses for a single
// meaning together with its restrictions, cross-references, classification
// codes, loanword sources and examples.
type JmdictSense struct {
// These elements, if present, indicate that the sense is restricted
// to the lexeme represented by the keb and/or reb.
RestrictedKanji []string `xml:"stagk"`
RestrictedReadings []string `xml:"stagr"`
// This element is used to indicate a cross-reference to another
// entry with a similar or related meaning or sense. The content of
// this element is typically a keb or reb element in another entry. In some
// cases a keb will be followed by a reb and/or a sense number to provide
// a precise target for the cross-reference. Where this happens, a JIS
// "centre-dot" (0x2126) is placed between the components of the
// cross-reference.
References []string `xml:"xref"`
// This element is used to indicate another entry which is an
// antonym of the current entry/sense. The content of this element
// must exactly match that of a keb or reb element in another entry.
Antonyms []string `xml:"ant"`
// Part-of-speech information about the entry/sense. Should use
// appropriate entity codes. In general where there are multiple senses
// in an entry, the part-of-speech of an earlier sense will apply to
// later senses unless there is a new part-of-speech indicated.
PartsOfSpeech []string `xml:"pos"`
// Information about the field of application of the entry/sense.
// When absent, general application is implied. Entity coding for
// specific fields of application.
Fields []string `xml:"field"`
// This element is used for other relevant information about
// the entry/sense. As with part-of-speech, information will usually
// apply to several senses.
Misc []string `xml:"misc"`
// This element records the information about the source
// language(s) of a loan-word/gairaigo. If the source language is other
// than English, the language is indicated by the xml:lang attribute.
// The element value (if any) is the source word or phrase.
SourceLanguages []JmdictSource `xml:"lsource"`
// For words specifically associated with regional dialects in
// Japanese, the entity code for that dialect, e.g. ksb for Kansaiben.
Dialects []string `xml:"dial"`
// The sense-information elements provided for additional
// information to be recorded about a sense. Typical usage would
// be to indicate such things as level of currency of a sense, the
// regional variations, etc.
Information []string `xml:"s_inf"`
// Within each sense will be one or more "glosses", i.e.
// target-language words or phrases which are equivalents to the
// Japanese word. This element would normally be present, however it
// may be omitted in entries which are purely for a cross-reference.
Glossary []JmdictGlossary `xml:"gloss"`
// Some JMdict entries can contain 0 or more examples
Examples []JmdictExample `xml:"example"`
}
// JmdictExample is one <example> element of a sense: the source of the
// example, the term it illustrates and the example sentences.
type JmdictExample struct {
// Each example has a Srce element that indicates the source of the
// example; the source is typically the Tatoeba Project.
Srce JmdictExampleSource `xml:"ex_srce"`
// The term associated with this example
Text string `xml:"ex_text"`
// Contains the example sentences (one per <ex_sent> element).
Sentences []JmdictExampleSentence `xml:"ex_sent"`
}
// JmdictExampleSource is the <ex_srce> element of an example: it identifies
// where the example comes from.
type JmdictExampleSource struct {
// The id of the example for the source (the element's character data).
ID string `xml:",chardata"`
// The source type (i.e. 'tat' for tatoeba), read from the exsrc_type
// attribute.
SrcType string `xml:"exsrc_type,attr"`
}
// JmdictExampleSentence is one <ex_sent> element of an example: a sentence
// and the language it is written in.
type JmdictExampleSentence struct {
// The language of the example sentence, read from the lang attribute.
Lang string `xml:"lang,attr"`
// The example sentence text (the element's character data).
Text string `xml:",chardata"`
}
// LoadJmdict reads a dictionary from reader by handing a fresh Jmdict to
// parseDict with its third argument set to true.
//
// It returns the Jmdict as parseDict left it, the map parseDict produced and
// parseDict's error, all passed through unchanged; callers should check the
// error before using the other two results.
//
// NOTE(review): parseDict is defined elsewhere; judging by the sibling
// LoadJmdictNoTransform, the true flag presumably enables a transform step
// and the map presumably holds the document's entities - confirm there.
func LoadJmdict(reader io.Reader) (Jmdict, map[string]string, error) {
	parsed := new(Jmdict)
	entityTable, parseErr := parseDict(reader, parsed, true)
	return *parsed, entityTable, parseErr
}
// LoadJmdictNoTransform reads a dictionary from reader by handing a fresh
// Jmdict to parseDict with its third argument set to false.
//
// It returns the Jmdict as parseDict left it, the map parseDict produced and
// parseDict's error, all passed through unchanged; callers should check the
// error before using the other two results.
//
// NOTE(review): parseDict is defined elsewhere; going by this function's
// name, the false flag presumably disables a transform step that LoadJmdict
// enables - confirm there.
func LoadJmdictNoTransform(reader io.Reader) (Jmdict, map[string]string, error) {
	parsed := new(Jmdict)
	entityTable, parseErr := parseDict(reader, parsed, false)
	return *parsed, entityTable, parseErr
}