query.go

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"

	"golang.org/x/net/html"
)

// Query represents a DOM query.
// It also recursively represents the whole query chain.
type Query struct {
	tokenizer    *html.Tokenizer   // contains the tokenized HTML DOM
	hasPrevQuery bool              // is there a previous query in the chain?
	hasNextQuery bool              // is there a next query in the chain?
	prevQuery    *Query            // previous query object
	nextQuery    *Query            // next query object
	match        map[string]string // attribute name/value pairs parsed from the search term
	result       [][]html.Token    // token results of the matches; the next query is executed on these
}

// Find processes the search term and then launches the document or token search.
func (q *Query) Find(term string) [][]html.Token {
	q.ProcessSearchTerm(term)
	return q.Search()
}

// Search decides whether to use RootSearch (on the DOM) or TokenSearch (on a
// list of previously matched tokens). It also makes sure the results of the
// last (sub-)query in the chain are returned.
func (q *Query) Search() [][]html.Token {
	var result [][]html.Token

	if q.hasPrevQuery {
		result = q.TokenSearch(q.prevQuery.result)
	} else {
		result = q.RootSearch()
	}

	if q.hasNextQuery {
		result = q.nextQuery.Search()
	}

	return result
}

// RootSearch is the search's entry point via the DOM; the different search
// methods are delegated from here.
//
// RootSearch is only executed for the first query in the query chain.
// All subsequent searches operate on the token results of the previous query.
func (q *Query) RootSearch() [][]html.Token {
	var finalTokens [][]html.Token

	for {
		tokenType := q.tokenizer.Next()
		if tokenType == html.ErrorToken {
			break
		}

		token := q.tokenizer.Token()
		if q.Match(token, token.Type) {
			tokenChain := q.GetTokenChainFromTokenizer(token)
			finalTokens = append(finalTokens, tokenChain)
		}
	}

	q.result = finalTokens
	return finalTokens
}
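
// TokenSearch runs the current query's matcher against the token chains
// collected by the previous query in the chain, instead of against the raw DOM.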
func (q *Query) TokenSearch(tokens [][]html.Token) [][]html.Token {
	var finalTokens [][]html.Token

	for _, tokenChain := range tokens {
		for _, token := range tokenChain {
			if q.Match(token, token.Type) {
				// tokenChain := q.GetTokenChain(token, tokenChain)
				finalTokens = append(finalTokens, tokenChain)
			}
		}
	}

	// store the results so a possible next query in the chain can search them
	q.result = finalTokens
	return finalTokens
}

// Match checks the given token against the parsed search terms of this Query.
func (q *Query) Match(token html.Token, tokenType html.TokenType) bool {
	success := true

	for domType, domValue := range q.match {
		switch {
		case tokenType == html.ErrorToken:
			return false
		case tokenType == html.StartTagToken:
			if !q.HasAttr(token, domType, domValue) {
				// attribute does not match
				success = false
			}
		default:
			success = false
		}
	}

	return success
}

// HasAttr reports whether the (HTML) token has the requested attribute with the specified value.
func (q *Query) HasAttr(token html.Token, attrType string, searchValue string) bool {
	for _, attr := range token.Attr {
		if attr.Key == attrType && attr.Val == searchValue {
			return true
		}
	}

	return false
}

// GetTokenChainFromTokenizer takes a snapshot of the whole token chain (in depth)
// until the root element is closed again. It works on the query-wide tokenizer,
// so the tokens consumed here are not seen by the outer search again; if another
// inner match is needed, the resulting chain has to be searched via TokenSearch.
func (q *Query) GetTokenChainFromTokenizer(rootToken html.Token) []html.Token {
	var tokenChain []html.Token
	depth := 1

	// rootToken is expected to be a start tag, otherwise the depth of the
	// result cannot be measured correctly
	if rootToken.Type != html.StartTagToken {
		return nil
	}

	tokenChain = append(tokenChain, rootToken)

	for {
		tokenType := q.tokenizer.Next()
		// stop on errors (including EOF)
		if tokenType == html.ErrorToken {
			break
		}

		// we are digging one step deeper
		if tokenType == html.StartTagToken {
			depth++
		}
		// and one step back out
		if tokenType == html.EndTagToken {
			depth--
		}

		// push the new item onto our chain
		tokenChain = append(tokenChain, q.tokenizer.Token())

		// once depth drops back to zero, the root element's EndTagToken has
		// just been appended, so the chain is complete
		//
		// TODO: this may break for self-closing / void tags later on
		if depth <= 0 {
			break
		}
	}

	return tokenChain
}
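
// As an illustration of the depth bookkeeping above: matching the start tag of
// a hypothetical fragment <div class="x"><p>hi</p></div> yields
//
//	token    type            depth after processing
//	<p>      StartTagToken   2
//	hi       TextToken       2
//	</p>     EndTagToken     1
//	</div>   EndTagToken     0  -> root closed, loop breaks
//
// with the matched <div> itself appended before the loop starts (depth 1).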

// ProcessSearchTerm splits the search term into a consecutive chain of search
// queries, each described by a search map.
func (q *Query) ProcessSearchTerm(term string) {
	var (
		queries  []string
		subQuery *Query
	)

	// only split into 2 parts, because the next query handles its
	// own subqueries by itself (recursion)
	queries = strings.SplitN(term, " ", 2)
	q.CreateSearchMap(queries[0])

	// we got subselects
	if len(queries) > 1 {
		subQuery = new(Query)

		// link the sub-query into the chain so Search can delegate to it
		q.hasNextQuery = true
		q.nextQuery = subQuery
		subQuery.hasPrevQuery = true
		subQuery.prevQuery = q

		// recurse so each consecutive sub-query processes its own subselects
		subQuery.ProcessSearchTerm(queries[1])
	}
}
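
// CreateSearchMap translates a single selector into the attribute map used by
// Match: ".gb1" becomes {"class": "gb1"}, and a hypothetical id selector such
// as "#main" becomes {"id": "main"}.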
func (q *Query) CreateSearchMap(query string) {
	if q.match == nil {
		q.match = make(map[string]string)
	}

	if strings.HasPrefix(query, ".") {
		q.match["class"] = strings.TrimPrefix(query, ".")
	} else if strings.HasPrefix(query, "#") {
		q.match["id"] = strings.TrimPrefix(query, "#")
	}
}

// Load reads the reader's input into a tokenizer, which can then be iterated
// to find or change values.
func (q *Query) Load(reader io.Reader) {
	q.tokenizer = html.NewTokenizer(reader)
}

func main() {
	resp, err := http.Get("https://www.google.de/")
	if err == nil {
		defer resp.Body.Close()

		q := new(Query)
		q.Load(resp.Body)

		result := q.Find(".gb1")
		fmt.Println(result)
	}
}
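
// The function below is a minimal usage sketch against an in-memory HTML
// fragment instead of a live HTTP response; the fragment and the ".headline"
// selector are made up for illustration.
func exampleLocalSearch() {
	doc := `<html><body>
		<div class="headline"><span>hello</span></div>
		<div class="footer">bye</div>
	</body></html>`

	q := new(Query)
	q.Load(strings.NewReader(doc))

	// prints the token chain of the matched <div class="headline"> element
	fmt.Println(q.Find(".headline"))
}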