This repository has been archived by the owner on Mar 5, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 91
/
spider.js
151 lines (134 loc) · 5.4 KB
/
spider.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
var request = require('superagent');
var url = require('url');
var jsdom = require('jsdom');
var app = require('./app');
var server;
var serveraddress = "http://localhost:" + app.PORT;
var chalk = require('chalk');
var locale = 'en-US';
// Call as `node spiders.js --forceLocale` to make missing locale warnings
// trigger a fail as far as npm test is concerned.
var failOnMissingLocale = process.argv.indexOf('--forceLocale') > -1;
// crawl-global "to-crawl" and "have-crawled" lists
var resolve = ['/'+locale+'/'];
var seen = {};
// tracks whether we should report success or failure for npm test purposes.
var failed = false;
// Predeclare both (mutually recursive) functions so ESLint is satisfied.
// BUG FIX: the original chained assignment
// (`var getPageLinks = resolvePageLinks = ...`) leaked `resolvePageLinks`
// as an implicit global — a ReferenceError under strict mode.
var getPageLinks = () => undefined;
var resolvePageLinks = () => undefined;
/**
 * Run through a page for all its link elements, map them to href attributes,
 * filter out any links already seen or already pending for crawling, and
 * add the remainder to the "to be crawled" list. Then recurse.
 * @param {string} htmlCode the HTML code associated with a specific page
 * @param {function} onFinish the callback to trigger once we're done crawling
 * @param {url} siteurl the URL for the page we're currently looking at
 * @param {url} from the page that we navigated from to get to the current page
 * @returns {undefined}
 */
resolvePageLinks = function(htmlCode, onFinish, siteurl, from) {
  // Note: jsdom.env() yields its result through the callback, so the
  // previously captured (and unused) `dom` return value has been dropped.
  jsdom.env(htmlCode, function (err, window) {
    // If this is an unknown not-http-status-related error, treat as error.
    if (err) {
      console.error(
        chalk.red.bold("ERROR: request error occurred trying to access"),
        chalk.yellow.bold(siteurl),
        chalk.red.bold("linked from"),
        chalk.yellow.bold(from)
      );
      failed = true;
      return getPageLinks(onFinish, siteurl);
    }
    var document = window.document;
    // Collect every anchor's href, then keep only non-empty, scheme-less
    // (i.e. relative, no ':' present) links that we have neither visited
    // nor already queued.
    var links = document.querySelectorAll('a');
    var linkArray = Array.from(links).map(a => a.href);
    var newLinks = linkArray.filter(nexturl => (!!nexturl && nexturl.indexOf(':')===-1 && seen[nexturl] !== true && resolve.indexOf(nexturl) === -1));
    // recurse with the updated to-resolve list
    newLinks.forEach(nexturl => resolve.push(nexturl));
    getPageLinks(onFinish, siteurl);
  });
};
/**
 * Get the set of <a> elements on a page, extract the 'href'
 * urls, add any urls not yet visited to the "to-be-resolved" list,
 * and add the current page url to the "seen" list.
 *
 * Do this recursively until we run out of new URLs to visit.
 *
 * @param {function} onFinish the callback to trigger once we're done crawling
 * @param {url} from the page that we navigated from to get to the current page
 * @returns {undefined}
 */
getPageLinks = function(onFinish, from) {
  // BUG FIX: `return` here. The original fell through after calling
  // onFinish(), spliced `undefined` off the empty list, took the
  // "empty link" error path, and recursed forever — invoking onFinish()
  // (and thus process.exit) on every iteration.
  if (resolve.length === 0) { return onFinish(); }
  // make sure we respect "./abc" and "../abc" url patterns
  from = from || '';
  var to = resolve.splice(0,1)[0];
  // If the url is blank, that's... really weird and an error.
  // (Checked *before* url.resolve(), which throws on non-string input.)
  if (!to) {
    console.error(
      chalk.red.bold("Empty link found on"),
      chalk.yellow.bold(from)
    );
    failed = true;
    return getPageLinks(onFinish, from);
  }
  var siteurl = url.resolve(from, to);
  // If there is no locale information that's not necessarily an error
  // as the server should be able to look up the en-US resource anyway
  // due to the presence of http accept languages in the request header.
  if (siteurl.indexOf(locale) === -1) {
    console.error(
      chalk.red.bold("WARNING: no locale found for"),
      chalk.yellow.bold(siteurl),
      chalk.red.bold("linked from"),
      chalk.yellow.bold(from)
    );
    failed = failed || failOnMissingLocale;
  }
  // So far so good: mark this url as seen and try to access it for more links
  seen[to] = true;
  seen[siteurl] = true;
  console.log("spidering "+siteurl, from? " (from "+from+")" : '', " ("+Object.keys(seen).length+" urls crawled)");
  // superagent request with headers that match what learning.mozilla.org receives
  request
  .get(serveraddress + siteurl)
  .set('Accept-Language', 'en-US,en;q=0.8')
  .set('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
  .end(function(err, res){
    // On a network-level failure superagent invokes the callback with no
    // response object at all, so guard `res` before reading res.status.
    if (!res || res.status !== 200) {
      console.error(
        chalk.red.bold("ERROR "+(res ? res.status : err)+": page at"),
        chalk.yellow.bold(siteurl),
        chalk.red.bold("linked from"),
        chalk.yellow.bold(from),
        chalk.red.bold("could not be found")
      );
      failed = true;
      return getPageLinks(onFinish, siteurl);
    }
    // find links and recurse
    resolvePageLinks(res.text, onFinish, siteurl, from);
  });
};
// MAIN SCRIPT ENTRY POINT:
// Spawn the app server as a child process, wait for it to announce
// readiness on stdout, then start the crawl at the locale root.
console.log("Starting server process...");
var spawn = require('child_process').spawn;
server = spawn('node', ['app']);
// If the server dies for any reason before the crawl completes, treat the
// whole run as a test failure (exit code 1).
server.on('close', () => { console.log("CLOSE"); server.kill(); process.exit(1); });
server.on('exit', () => { console.log("EXIT"); server.kill(); process.exit(1); });
// Forward the server's stderr so its errors show up in our output.
server.stderr.on('data', (data) => { console.error(data.toString()); });
server.stdout.on('data', (data) => {
  data = data.toString();
  // app.READY_STRING is the sentinel the server prints once it is listening.
  if (data.indexOf(app.READY_STRING) > -1) {
    console.log("Starting spider crawl...");
    // Kick off the crawl by starting at our en-US root
    // once the server writes the ready string to stdout.
    getPageLinks(function onFinish() {
      console.log("Finishing spider crawl" + (failed? " with errors":'') + ".");
      // NOTE(review): this exit() runs before the 'close'/'exit' handlers
      // above can fire for the killed child, so the exit code here wins.
      server.kill();
      process.exit(failed ? 1 : 0);
    });
  }
});