-
Notifications
You must be signed in to change notification settings - Fork 68
/
Copy pathutils_layout.go
92 lines (77 loc) · 2.55 KB
/
utils_layout.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
package monday
import (
"strings"
"unicode"
"unicode/utf8"
)
// dateStringLayoutItem represents one homogeneous run of characters of a date
// string: a run of letters, a run of digits, or a run of delimiters between them.
// This is an abstraction level above the raw character string of a date representation.
//
// Example, as split by stringToLayoutItems: "1 February / 2013" ->
// dateStringLayoutItem { item: "1", isWord: false, isDigit: true }
// dateStringLayoutItem { item: " ", isWord: false, isDigit: false }
// dateStringLayoutItem { item: "February", isWord: true, isDigit: false }
// dateStringLayoutItem { item: " / ", isWord: false, isDigit: false }
// dateStringLayoutItem { item: "2013", isWord: false, isDigit: true }
type dateStringLayoutItem struct {
// item is the raw text of this run, exactly as it should appear in the date string.
item string
isWord bool // true if this is a sequence of letters (unicode.IsLetter), as populated by extractLetterSequence; false for digit runs and delimiters
isDigit bool // true if this is a sequence only containing digits (unicode.IsDigit), as populated by extractLetterSequence
}
// extractLetterSequence extracts the first homogeneous run of characters of
// originalStr starting at byte offset index and wraps it in a
// dateStringLayoutItem.
//
// Every rune falls into one of three classes: letter (unicode.IsLetter),
// digit (unicode.IsDigit) or anything else (delimiters). The run begins with
// the rune at index and extends for as long as the following runes belong to
// the same class. The isWord and isDigit fields of the result describe that
// class: isWord is set for a run of letters, isDigit for a run of digits, and
// neither is set for a run of delimiters.
//
// The returned item is a substring of originalStr, so len(item) is always the
// exact number of bytes consumed. This holds even for invalid UTF-8: each
// invalid byte is treated as a one-byte delimiter and is preserved verbatim
// rather than being replaced by U+FFFD.
//
// index must lie within [0, len(originalStr)]; index == len(originalStr)
// yields an empty item with both flags false.
func extractLetterSequence(originalStr string, index int) (it dateStringLayoutItem) {
	rest := originalStr[index:]
	if rest == "" {
		return it
	}

	// The first rune decides the class of the whole run.
	first, end := utf8.DecodeRuneInString(rest)
	it.isWord = unicode.IsLetter(first)
	it.isDigit = unicode.IsDigit(first)

	// Extend the run while subsequent runes stay in the same class.
	for end < len(rest) {
		r, size := utf8.DecodeRuneInString(rest[end:])
		if unicode.IsLetter(r) != it.isWord || unicode.IsDigit(r) != it.isDigit {
			break
		}
		end += size
	}

	// Slice instead of re-encoding so the original bytes (and their count)
	// are kept intact.
	it.item = rest[:end]
	return it
}
// stringToLayoutItems splits a raw date string (like "2 Mar 2012") into the
// consecutive dateStringLayoutItems produced by extractLetterSequence, which
// are more convenient to work with in other analysis modules.
//
// The string is scanned left to right; after each extracted item the cursor
// moves forward by that item's byte length. An empty dateStr yields a nil
// slice.
func stringToLayoutItems(dateStr string) (seqs []dateStringLayoutItem) {
	for pos := 0; pos < len(dateStr); {
		next := extractLetterSequence(dateStr, pos)
		seqs = append(seqs, next)
		pos += len(next.item)
	}
	return seqs
}
// layoutToString concatenates the item text of every element of li, in
// order, and returns the resulting string. A nil or empty slice yields "".
func layoutToString(li []dateStringLayoutItem) string {
	// Add up the byte lengths first so the builder's buffer can be
	// reserved once instead of growing while the items are written.
	total := 0
	for idx := range li {
		total += len(li[idx].item)
	}

	var out strings.Builder
	out.Grow(total)
	for idx := range li {
		out.WriteString(li[idx].item)
	}
	return out.String()
}