-
Notifications
You must be signed in to change notification settings - Fork 15
/
grammar_refine.go
77 lines (74 loc) · 1.79 KB
/
grammar_refine.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package fmr
import (
"fmt"
"strings"
"github.com/liuzl/ling"
"github.com/liuzl/unidecode"
"github.com/mitchellh/hashstructure"
)
// refine replaces every Terminal term found in the bodies of g.Rules with a
// Nonterminal term that refers to a generated rule.
//
// For each distinct terminal text, a document is built from the text and
// annotated through NLP().Annotate. A RuleBody is then assembled with one
// Terminal term per non-space token, each carrying the Meta of the term that
// first introduced the text. The generated rule is named
// "<prefix>_t_<token>_<token>..." using the transliterated text of the
// non-punctuation tokens; when that name is already used by g.Rules or by a
// rule generated earlier in this call, "_0", "_1", ... is appended until the
// name is free. Every token seen is also stored in gTokens if it is not
// there yet.
//
// The grammar itself is modified only after all terminals were processed
// successfully: if an error is returned, g.Rules and its terms are left as
// they were (gTokens may already hold tokens seen before the failure).
// Calling refine on a grammar whose Refined flag is set is a no-op.
func (g *Grammar) refine(prefix string) error {
	if g.Refined {
		return nil
	}

	var (
		// pendingTerms[i] must become a Nonterminal named pendingNames[i]
		// once the whole pass has succeeded.
		pendingTerms []*Term
		pendingNames []string
		// terminalRules are registered in g.Rules only after the scan, so
		// the map is never written to while it is being ranged over.
		terminalRules []*Rule
		terminals     = make(map[string]string) // terminal text -> generated rule name
		names         = make(map[string]bool)   // rule names generated by this call
	)

	for _, rule := range g.Rules {
		for _, body := range rule.Body {
			for _, term := range body.Terms {
				if term.Type != Terminal {
					continue
				}

				// NOTE(review): the cache is keyed on the text only, so a later
				// terminal with the same text but a different Meta reuses the
				// rule built for the first one — confirm this is intended.
				name, seen := terminals[term.Value]
				if !seen {
					d := ling.NewDocument(term.Value)
					if err := NLP().Annotate(d); err != nil {
						return fmt.Errorf("annotating terminal %q: %w", term.Value, err)
					}

					base := prefix + "_t"
					rb := &RuleBody{}
					for _, token := range d.Tokens {
						if token.Type == ling.Space {
							continue
						}
						// Punctuation stays in the rule body but does not
						// contribute to the rule name.
						if token.Type != ling.Punct {
							ascii := unidecode.Unidecode(token.Text)
							base += "_" + strings.Join(strings.Fields(ascii), "_")
						}
						rb.Terms = append(rb.Terms,
							&Term{Value: token.Text, Type: Terminal, Meta: term.Meta})
						if gTokens.get(token.Text) == nil {
							gTokens.put(token.Text, token)
						}
					}

					// Try base, base_0, base_1, ... until the name is unused.
					name = base
					for n := 0; g.Rules[name] != nil || names[name]; n++ {
						name = fmt.Sprintf("%s_%d", base, n)
					}

					hash, err := hashstructure.Hash(rb, nil)
					if err != nil {
						return fmt.Errorf("hashing rule body of terminal %q: %w", term.Value, err)
					}

					names[name] = true
					terminals[term.Value] = name
					terminalRules = append(terminalRules,
						&Rule{Name: name, Body: map[uint64]*RuleBody{hash: rb}})
				}

				pendingTerms = append(pendingTerms, term)
				pendingNames = append(pendingNames, name)
			}
		}
	}

	// Commit: nothing above touched the grammar, so a failure cannot leave
	// terms pointing at rules that were never registered.
	for i, term := range pendingTerms {
		term.Value = pendingNames[i]
		term.Type = Nonterminal
	}
	for _, r := range terminalRules {
		g.Rules[r.Name] = r
	}
	g.Refined = true
	return nil
}