-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathregularuse.go
148 lines (130 loc) · 3.97 KB
/
regularuse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
package kanji
import (
"strings"
"unicode"
"unicode/utf8"
)
// IsRegularUse reports whether the rune r appears in any of the regular-use
// kanji (常用漢字表) tables: the standard forms, the old forms, or the
// tolerable forms.
func IsRegularUse(r rune) bool {
	if unicode.Is(StandardRangeTable, r) {
		return true
	}
	if unicode.Is(OldFormRangeTable, r) {
		return true
	}
	return unicode.Is(TolerableRangeTable, r)
}
// IsStandardRegularUse reports whether the rune r is a member of
// StandardRangeTable, the standard-form part of the regular-use list
// (常用漢字表:標準字体).
func IsStandardRegularUse(r rune) bool {
	inStandard := unicode.In(r, StandardRangeTable)
	return inStandard
}
// IsOldFormRegularUse reports whether the rune r is a member of
// OldFormRangeTable, the old-form part of the regular-use list
// (常用漢字表:旧字体).
func IsOldFormRegularUse(r rune) bool {
	inOldForm := unicode.In(r, OldFormRangeTable)
	return inOldForm
}
// IsTolerableRegularUse reports whether the rune r is a member of
// TolerableRangeTable, the tolerable-form part of the regular-use list
// (常用漢字表:許容字体).
func IsTolerableRegularUse(r rune) bool {
	inTolerable := unicode.In(r, TolerableRangeTable)
	return inTolerable
}
// IsNotRegularUse reports whether the rune r is a Han character (unicode.Han)
// that is absent from the regular-use list (常用漢字表). Runes outside
// unicode.Han always yield false.
func IsNotRegularUse(r rune) bool {
	// Only Han characters can be classified as "not regular-use".
	if !unicode.Is(unicode.Han, r) {
		return false
	}
	return !IsRegularUse(r)
}
// ReplaceNotRegularUseAll returns a copy of the string s with each run of not in regular-use kanji
// replaced by the replacement string, which may be empty.
func ReplaceNotRegularUseAll(s, replacement string) string {
return replace(s, replacement, func(r rune) bool { return !IsNotRegularUse(r) })
}
// Option configures a RegularUseDiscriminator. Options are passed to
// NewRegularUseDiscriminator, which applies each one to the discriminator it
// is building.
type Option func(d *RegularUseDiscriminator)
// Allow returns an Option that appends the given runes to the discriminator's
// list of allowed characters.
func Allow(r ...rune) Option {
	return Option(func(target *RegularUseDiscriminator) {
		target.allow = append(target.allow, r...)
	})
}
// Disallow returns an Option that appends the given runes to the
// discriminator's list of disallowed characters.
func Disallow(r ...rune) Option {
	return Option(func(target *RegularUseDiscriminator) {
		target.disallow = append(target.disallow, r...)
	})
}
// RegularUseDiscriminator decides whether a character counts as regular-use
// kanji, taking per-instance allowed and disallowed characters into account.
type RegularUseDiscriminator struct {
	// allow holds runes for which IsNotRegularUse always returns false.
	// It is filled by the Allow option.
	allow []rune
	// disallow holds runes for which IsNotRegularUse returns true, unless the
	// rune is also present in allow. It is filled by the Disallow option.
	disallow []rune
}
// NewRegularUseDiscriminator builds a RegularUseDiscriminator and applies the
// given options to it in the order they were passed.
func NewRegularUseDiscriminator(options ...Option) *RegularUseDiscriminator {
	d := new(RegularUseDiscriminator)
	for i := range options {
		options[i](d)
	}
	return d
}
// IsNotRegularUse reports whether the rune r should be treated as not
// regular-use by this discriminator. A rune in the allow list yields false and
// a rune in the disallow list yields true; the allow list is consulted first,
// so a rune present in both lists yields false. Any other rune is classified
// by the package-level IsNotRegularUse.
func (d *RegularUseDiscriminator) IsNotRegularUse(r rune) bool {
	// Explicitly allowed characters win over everything else.
	for i := range d.allow {
		if d.allow[i] == r {
			return false
		}
	}
	// Explicitly disallowed characters are rejected even if they are not kanji.
	for i := range d.disallow {
		if d.disallow[i] == r {
			return true
		}
	}
	// No override applies: use the default classification.
	return IsNotRegularUse(r)
}
// ReplaceNotRegularUseAll returns a copy of the string s in which the
// characters that d.IsNotRegularUse rejects are substituted with the
// replacement string, which may be empty. The work is delegated to replace,
// which additionally substitutes replacement for invalid UTF-8 bytes.
func (d *RegularUseDiscriminator) ReplaceNotRegularUseAll(s, replacement string) string {
	// replace expects a "keep this rune" predicate, hence the negation.
	keep := func(r rune) bool {
		return !d.IsNotRegularUse(r)
	}
	return replace(s, replacement, keep)
}
// replace returns a copy of s in which every rune rejected by is, and every
// run of invalid UTF-8 bytes, is substituted with replacement.
//
// is reports whether a rune should be kept. Each rejected rune is replaced
// individually, whereas consecutive invalid bytes collapse into a single
// replacement. A well-formed U+FFFD in the input is an ordinary rune and is
// kept or replaced according to is. When nothing needs replacing, s itself is
// returned and no new string is built.
func replace(s, replacement string, is func(rune) bool) string {
	// Find the offset of the first rune or byte that must be replaced, so that
	// unchanged input can be returned as-is.
	first := -1
	for i, c := range s {
		if is(c) {
			if c != utf8.RuneError {
				continue
			}
			// range yields RuneError both for a well-formed U+FFFD and for an
			// invalid byte; only the invalid byte decodes with width 1.
			if _, wid := utf8.DecodeRuneInString(s[i:]); wid != 1 {
				continue
			}
		}
		first = i
		break
	}
	if first < 0 {
		return s
	}

	var b strings.Builder
	b.Grow(len(s) + len(replacement))
	b.WriteString(s[:first])

	invalid := false // previous byte was from an invalid UTF-8 sequence
	for i := first; i < len(s); {
		r, wid := utf8.DecodeRuneInString(s[i:])
		switch {
		case r == utf8.RuneError && wid == 1:
			// Invalid byte: emit one replacement per run of invalid bytes.
			if !invalid {
				b.WriteString(replacement)
			}
			invalid = true
		case is(r):
			// Every decoded rune, ASCII included, goes through is, so a
			// predicate that rejects ASCII characters is honored.
			invalid = false
			b.WriteString(s[i : i+wid])
		default:
			invalid = false
			b.WriteString(replacement)
		}
		i += wid
	}
	return b.String()
}