// scraper-js 1.0.0
// http://github.com/jasonaibrahim/scraper
// (c) 2014-2015 Jason Ibrahim
// scraper may be freely distributed under the MIT license.
/*global require: true, module: true */
var http = require('http'),
    https = require('https'),
    url = require('url'),
    request = require('request'),
    Q = require('q'),
    cheerio = require('cheerio'),
    _ = require('underscore');

/** based on the url string passed in, return the proper node http module to use.
    note: this helper is not currently used by the Scraper below. */
var getProtocol = function (url) {
    'use strict';
    // TODO this function could be safer: validate the url and handle protocols other than http(s)
    if (url.slice(0, 5) === 'https') { return https; }
    if (url.slice(0, 4) === 'http') { return http; }
    return null;
};
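// for illustration (hypothetical inputs):
//   getProtocol('https://example.com') -> the https module
//   getProtocol('ftp://example.com')   -> null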

/** define the node module */
module.exports.Scraper = function () {
    'use strict';

    // iterate through a list of candidate urls and keep only those that resolve
    // to an absolute url pointing at a jpg, png or gif
    function clean (candidates) {
        // TODO use a single regex for checking the extname
        var cleanCandidates = [], ext, i;
        for (i = 0; i < candidates.length; i += 1) {
            ext = extmatch(candidates[i]);
            if (httpmatch(candidates[i]) && (ext === 'jpg' || ext === 'png' || ext === 'gif')) {
                cleanCandidates.push(httpmatch(candidates[i]));
            }
        }
        return _.uniq(cleanCandidates);
    }
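    // for illustration (hypothetical input):
    //   clean(['//a.com/x.jpg', 'http://a.com/page.html']) -> ['http://a.com/x.jpg']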

    // check if the string passed in has a filename extension and return the
    // extension (the text after the last '.' and before any query string)
    function extmatch (path) {
        var extRegex = /.+\.([^?]+)(\?|$)/,
            match = path && path.match(extRegex);
        return (match && match[1]) || null;
    }
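    // for illustration (hypothetical inputs):
    //   extmatch('http://example.com/pic.png?w=100') -> 'png'
    //   extmatch('no-extension')                     -> null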

    // given a url, return a deferred list of possible images found on that page
    function getImages (url) {
        var promise = Q.defer(), candidates = [];
        // if the url is itself a clean image url, there is nothing to scrape
        if (clean([url]).length > 0) {
            promise.resolve([url]);
            return promise.promise;
        }
        request(url, function (error, response, html) {
            if (!error && response.statusCode === 200) {
                // collect the src of every <img> tag and the content of every <meta> tag
                var $ = cheerio.load(html), imagetags = $('img'), metatags = $('meta'), i;
                for (i = 0; i < imagetags.length; i += 1) {
                    candidates.push(imagetags[i].attribs.src);
                }
                for (i = 0; i < metatags.length; i += 1) {
                    candidates.push(metatags[i].attribs.content);
                }
                candidates = clean(candidates);
                promise.resolve(candidates);
            } else {
                promise.reject(error || new Error('Bad response: ' + response.statusCode));
            }
        });
        return promise.promise;
    }

    // normalize a candidate to an absolute url: prefix protocol-relative urls
    // with 'http:', pass through urls that already carry a protocol, and return
    // null for everything else
    function httpmatch (url) {
        var httpRegex = /^(\/\/|f|ht)tps?:\/\//i;
        if (url && url.indexOf('//') === 0) { return 'http:' + url; }
        if (url && url.match(httpRegex)) { return url; }
        return null;
    }
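    // for illustration (hypothetical inputs):
    //   httpmatch('//cdn.example.com/pic.png') -> 'http://cdn.example.com/pic.png'
    //   httpmatch('/relative/pic.png')         -> null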

    // this function is designed to further determine whether a list of candidate
    // images is 'good'. as of right now, it simply resolves with the list passed in.
    function judge (finalists, address) {
        var promise = Q.defer();
        // TODO return only the best images in the list of finalist candidates
        promise.resolve(finalists);
        return promise.promise;
    }

    // from a list of candidate images, drop those below a certain size threshold.
    function narrow (candidates) {
        var promise = Q.defer(), finalists = [], seen = 0, i, j;
        // helper to filter out images that are smaller than 1000 bytes
        function minsize (finalist) {
            return finalist.size > 1000;
        }
        // with no candidates there is nothing to request and the counter below
        // would never fire, so resolve with an empty list right away
        if (candidates.length === 0) {
            promise.resolve(finalists);
            return promise.promise;
        }
        for (i = 0; i < candidates.length; i += 1) {
            // a HEAD request is enough to read the content-length without
            // downloading the image itself
            request({url: candidates[i], method: 'HEAD'}, function (error, response) {
                if (response) {
                    finalists.push({
                        url: response.request.href,
                        size: parseInt(response.headers['content-length'], 10) || 0
                    });
                }
                // once every candidate has been seen, filter by size, sort
                // largest-first and resolve with the bare urls
                if ((seen += 1) === candidates.length) {
                    finalists = finalists.filter(minsize);
                    finalists.sort(function (a, b) {
                        return b.size - a.size;
                    });
                    for (j = 0; j < finalists.length; j += 1) {
                        finalists[j] = finalists[j].url;
                    }
                    promise.resolve(finalists);
                }
            });
        }
        return promise.promise;
    }

    // break the url into as many distinct words as possible, dropping protocol
    // and domain noise like 'http', 'www' and 'com'.
    // note: like getProtocol, this helper is not currently used by scrape.
    function parse (url) {
        var wordRegex = /^(http|https|www|com)/,
            words = url.split('/').filter(function (el) { return el !== ''; }),
            parsed = [],
            i,
            j,
            subwords;
        for (i = 0; i < words.length; i += 1) {
            if (!words[i].match(wordRegex)) {
                // a segment may still contain dots (e.g. 'example.com'), so
                // split again on '.' and filter each piece
                subwords = words[i].split('.');
                for (j = 0; j < subwords.length; j += 1) {
                    if (!subwords[j].match(wordRegex)) {
                        parsed.push(subwords[j]);
                    }
                }
            }
        }
        return parsed;
    }
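    // for illustration (hypothetical input):
    //   parse('http://example.com/cat-photos/cute.html')
    //   -> ['example', 'cat-photos', 'cute', 'html']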

    // the scraper api: takes in a page url, returns a deferred list of image urls.
    this.scrape = function (address) {
        var deferred = Q.defer();
        if (address) {
            // pipeline: collect candidates, narrow them by size, then judge the finalists
            getImages(address)
                .then(function (candidates) { return narrow(candidates); })
                .then(function (finalists) { return judge(finalists, address); })
                .then(function (medalists) { deferred.resolve(medalists); })
                .fail(function (error) { deferred.reject(error); });
        } else {
            deferred.reject(new Error('Scraper can\'t scrape nil. Feed it a url in the params.'));
        }
        return deferred.promise;
    };
};
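
// a minimal usage sketch, not part of the original module: it assumes this file
// is saved as scraper.js and that the target page is reachable. the guard keeps
// it from running when the module is required; invoke directly with
// `node scraper.js <url>`.
if (require.main === module) {
    var scraper = new module.exports.Scraper();
    scraper.scrape(process.argv[2])
        .then(function (images) { console.log(images); })
        .fail(function (error) { console.error(error.message || error); });
}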