-
Notifications
You must be signed in to change notification settings - Fork 2
/
Spider.php
289 lines (250 loc) · 6.22 KB
/
Spider.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
<?php
/**
* @class Spider
* @author Robert McLeod <[email protected]>
* @date 23/04/2010
* @version 0.1b
* @copyright 2009 Robert McLeod
*/
class Spider {

    /** @var DOMDocument Document holding the most recently loaded HTML */
    protected $d;

    /** @var Curl HTTP client used for every request */
    protected $c;

    /** @var string|null Raw body of the last response */
    protected $body;

    /** @var array|null Headers of the last response */
    protected $head;

    /** @var bool Whether xpath results are wrapped in a DOMNodeListWrapper */
    protected $returnCustomDOMNodeList = true;

    /**
     * Creates the DOMDocument and the Curl client used by this spider.
     */
    public function __construct() {
        $this->d = new DOMDocument;
        $this->c = new Curl;
    }

    /**
     * Releases the collaborators and the cached response.
     *
     * Deliberately does not call reset(): building a fresh DOMDocument
     * and Curl client while the object is being torn down is wasted work
     * and can raise errors during shutdown.
     */
    public function __destruct() {
        $this->d = null;
        $this->c = null;
        $this->body = null;
        $this->head = null;
    }

    /**
     * Forgets the last response and starts over with a fresh
     * DOMDocument and Curl client.
     *
     * @return void
     */
    public function reset() {
        $this->body = null;
        $this->head = null;
        // Re-run the constructor so that a subclass constructor (if any)
        // gets the chance to re-apply its own set-up as well.
        $this->__construct();
    }

    /**
     * Does a curl request using the curl library and returns this.
     * Saves the head and body to the object and loads the body
     * into the DOMDocument.
     *
     * @param string $method POST or GET (anything other than 'POST' is sent as GET)
     * @param string $url    The url to request
     * @param array  $vars   Associative array of post data
     *
     * @return Spider
     *
     * @throws Exception When the curl library reports a failure
     */
    public function request( $method, $url, $vars=array() ) {
        $r = ( $method === 'POST' )
            ? $this->c->post( $url, $vars )
            : $this->c->get( $url );

        if ( !$r ) {
            throw new Exception('Curl Error: '. $this->c->error());
        }

        $this->body = $r->body;
        $this->head = $r->headers;
        $this->loadMarkup( $r->body );

        return $this;
    }

    /**
     * Shortcut method for a get request
     *
     * @param string $url The url to get
     *
     * @return Spider
     */
    public function get( $url ) {
        return $this->request( 'GET', $url );
    }

    /**
     * Shortcut method for a post request
     *
     * @param string $url  The url to post
     * @param array  $vars The post data
     *
     * @return Spider
     */
    public function post( $url, $vars ) {
        return $this->request( 'POST', $url, $vars );
    }

    /**
     * Loads an HTML string into the DOMDocument without leaking libxml
     * parse warnings for real-world (malformed) markup.
     *
     * @param mixed $html The markup to load
     *
     * @return void
     */
    private function loadMarkup( $html ) {
        // DOMDocument::loadHTML() refuses an empty source, so fall back
        // to an empty document instead of failing on an empty response.
        if ( !is_string( $html ) || trim( $html ) === '' ) {
            $html = '<html></html>';
        }

        // Collect parser errors internally rather than silencing with "@",
        // then restore whatever error mode the caller had configured.
        $previous = libxml_use_internal_errors( true );
        $this->d->loadHTML( $html );
        libxml_clear_errors();
        libxml_use_internal_errors( $previous );
    }

    /**
     * Main function for executing xpath queries.
     * The source argument can be a url, an html string, or
     * an array containing the url to post to and an array of
     * post data. When it is null the query runs against whatever
     * is already loaded in the DOMDocument.
     *
     * @param string $patt The XPath query string
     * @param mixed  $src  The url to get, html to use, or array as above
     *
     * @return object DOMNodeListWrapper, or DOMNodeList when wrapping is disabled
     *
     * @throws Exception When the query is invalid or matches nothing
     */
    private function xpath( $patt, $src ) {
        // The array check has to come first: string functions must never
        // see the array (POST) form or a null source.
        if ( is_array( $src ) ) {
            $this->post( $src[0], isset( $src[1] ) ? $src[1] : array() );
        } elseif ( is_string( $src ) && $src !== '' ) {
            if ( strncmp( $src, 'http', 4 ) === 0 ) {
                $this->get( $src );
            } else {
                // Plain HTML: query this markup instead of the last response
                $this->loadMarkup( $src );
            }
        }

        $x = new DOMXPath( $this->d );
        $DOMNodeList = $x->query( $patt );

        // query() yields false (not an empty list) for a malformed expression
        if ( $DOMNodeList === false ) {
            throw new Exception("Invalid Xpath query: <pre>$patt</pre>");
        }

        if ( $DOMNodeList->length == 0 ) {
            throw new Exception("Xpath query does not return any results: <pre>$patt</pre>");
        }

        if ( $this->returnCustomDOMNodeList ) {
            return new DOMNodeListWrapper( $DOMNodeList );
        }

        return $DOMNodeList;
    }

    /**
     * Shortcut method for xpath. Returns an array containing
     * the nodeValues for each DOMNode object returned from the
     * xpath method.
     *
     * @param string $patt The xpath query
     * @param mixed  $src  The source to run the query on
     *
     * @return array
     */
    public function qa( $patt, $src=null ) {
        $objList = $this->xpath( $patt, $src );

        // A plain DOMNodeList is traversable but not callable; the custom
        // wrapper is invoked to obtain its items, as the original code did.
        $nodes = ( $objList instanceof DOMNodeList ) ? $objList : $objList();

        $a = array();
        foreach ( $nodes as $o ) {
            $a[] = $o->nodeValue;
        }

        return $a;
    }

    /**
     * Shortcut method for xpath. Returns the node list
     * (wrapped or plain, see returnCustomDOMNodeList()).
     *
     * @param string $patt The xpath query
     * @param mixed  $src  The source to run the query on
     *
     * @return object
     */
    public function qq( $patt, $src=null ) {
        return $this->xpath( $patt, $src );
    }

    /**
     * Shortcut method for xpath. Returns a single item (the first
     * one by default) from the list returned by the xpath method.
     *
     * @param string $patt The xpath query
     * @param mixed  $src  The source to run the query on
     * @param int    $i    The list index to return
     *
     * @return object
     */
    public function qf( $patt, $src=null, $i=0 ) {
        return $this->xpath( $patt, $src )->item($i);
    }

    /**
     * Returns the headers from the last request, or if a
     * parameter name is provided, returns that header instead.
     *
     * @param string $param The parameter to get
     *
     * @return mixed All headers, one header value, or null when the
     *               requested header is not present
     */
    public function getHead( $param=null ) {
        if ( is_null( $param ) ) {
            return $this->head;
        }

        // isset() also covers the "no request made yet" case where head is null
        return isset( $this->head[$param] ) ? $this->head[$param] : null;
    }

    /**
     * Returns the raw HTML from the last request
     *
     * @return string
     */
    public function getBody() {
        return $this->body;
    }

    /**
     * Sets the curl options via an associative array.
     * Option names can be specified as done on github.com/shuber/curl
     * or php.net/curl_setopt
     *
     * @param array $options Associative array of options
     *
     * @return Spider
     */
    public function setCurlOptions( $options = array() ) {
        foreach ( $options as $n => $v ) {
            // Options live in the client's "options" array (the same one
            // followRedirects() writes to), not in its request headers.
            $this->c->options[$n] = $v;
        }

        return $this;
    }

    /**
     * Sets the referer in curl
     *
     * @param string $r The referer
     *
     * @return Spider
     */
    public function setReferer( $r ) {
        $this->c->referer = $r;
        return $this;
    }

    /**
     * Sets the useragent in curl
     *
     * @param string $ua User agent string to use
     *
     * @return Spider
     */
    public function setUserAgent( $ua='no one interesting...' ) {
        $this->c->user_agent = $ua;
        return $this;
    }

    /**
     * Sets whether or not curl should follow redirects
     *
     * @param bool $b True or false
     *
     * @return Spider
     */
    public function followRedirects( $b ) {
        $this->c->options['followlocation'] = $b;
        return $this;
    }

    /**
     * Sets whether to return a custom DOMNodeList using
     * the DOMNodeListWrapper class, or a normal DOMNodeList
     *
     * @param bool $b True or false
     *
     * @return Spider
     */
    public function returnCustomDOMNodeList( $b ) {
        $this->returnCustomDOMNodeList = $b;
        return $this;
    }

    /**
     * Apply a template to a page. Runs every class variable of the
     * template as an xpath query against the page and hands the collected
     * results to the template's process() method.
     *
     * @param object $template The template object
     * @param mixed  $url      URL to get or array( url, post data ) to POST;
     *                         null re-uses the document that is already loaded
     *
     * @return mixed Whatever the template's process() method returns
     */
    public function applyTemplate( $template, $url=null ) {
        // Check if we need to post or get
        if ( is_array( $url ) ) {
            $this->post( $url[0], isset( $url[1] ) ? $url[1] : array() );
        } elseif ( !is_null( $url ) ) {
            $this->get( $url );
        }

        // init our object to return
        $o = new stdClass;

        // get_class_vars() wants a class name, not an instance
        $class = is_object( $template ) ? get_class( $template ) : $template;

        // Run through all the class variables
        foreach ( get_class_vars( $class ) as $var => $patt ) {
            // Process the pattern into the object with the same var name.
            // NOTE(review): "inner" is presumably provided by the items of
            // DOMNodeListWrapper - confirm against that class.
            $o->$var = $this->qf( $patt )->inner;
        }

        // Give it back to the template to process any vars
        return $template->process( $o );
    }
}
?>