-
Notifications
You must be signed in to change notification settings - Fork 2
/
cli.js
114 lines (94 loc) · 4.3 KB
/
cli.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import fs from 'fs/promises';
import { WebExtractor } from './src/model/web-extractor.js';
import path from 'path';
import { fileURLToPath } from 'url';
import * as urlUtil from './src/util/url-util.js';
import * as ruleUtil from './src/util/rule-util.js';
import cli from 'commander';
import config from './src/config.js';
import _ from 'lodash';
import * as slg from 'single-line-log';
// stdout logger taken from the single-line-log package; run() calls it to print progress updates.
const singleLineLog = slg.stdout;
/**
 * Help texts for the command line options, keyed by option name.
 * Each entry is passed as the description argument when the matching
 * option is registered in run().
 */
const optDesc = {
    // required options
    urls: "A path to a file with a list of urls for extraction. Each url should be on it's own line",
    destination: 'A path to the dir where data should be saved. If the dir already contains previous collected data the new data will be appended to the existing files',

    // optional settings
    rules: 'A path to the dir where extraction rules are located. If not set the "rules" folder in project will be used as default',
    concurrency: 'The maximum simultaneous loaded web pages',
    noScreenshot: 'Disable screenshots',
    pageTimeout: 'Milliseconds to wait for the initial loading of a page',
    headless: 'run the browser in headless mode',
    useIdForScreenshotName: 'Use an universal unique id for screenshot names instead of the url',
    debug: 'Print more detailed error information'
};
/**
 * Entry point of the command line interface.
 *
 * Registers and parses the command line options, loads the extraction rules
 * and the url list, runs a WebExtractor over them while printing progress
 * through singleLineLog, and finally prints the elapsed time.
 *
 * Errors are handled here rather than propagated: the whole error is printed
 * when config.debug is set, otherwise only its message. In both cases the
 * process exit code is set to 1 so calling scripts can detect the failure.
 *
 * @returns {Promise<void>} settles once extraction finished or an error was reported
 */
async function run() {
    try {
        cli.requiredOption('-u, --urls <file>', optDesc.urls);
        cli.requiredOption('-d, --destination <directory>', optDesc.destination);
        cli.option('-r, --rules <directory>', optDesc.rules);
        cli.option('-c, --concurrency <integer>', optDesc.concurrency, WebExtractor.DEFAULT_OPTIONS.maxConcurrency);
        cli.option('-n, --no-screenshot', optDesc.noScreenshot);
        cli.option('-t, --page-timeout <integer>', optDesc.pageTimeout, WebExtractor.DEFAULT_OPTIONS.pageTimeoutMs);
        // The value placeholder previously lacked its closing '>'.
        cli.option('-h, --headless <boolean>', optDesc.headless, WebExtractor.DEFAULT_OPTIONS.headless);
        cli.option('-i, --use-id-for-screenshot-name', optDesc.useIdForScreenshotName, WebExtractor.DEFAULT_OPTIONS.useIdForScreenshotName);
        cli.option('-x, --debug', optDesc.debug, false);
        cli.parse(process.argv);

        // Set the debug flag before validating anything else, so that option
        // parsing errors are also reported in detail when --debug is given.
        config.debug = cli.debug;

        const urlsPath = cli.urls;
        const destDir = cli.destination;
        const concurrency = Math.max(1, parseIntOrThrow(cli.concurrency));
        const pageTimeout = Math.max(1, parseIntOrThrow(cli.pageTimeout));
        // An absent value (no CLI argument and no default) is left out of the
        // options below instead of being treated as an error.
        const hasHeadless = cli.headless !== undefined && cli.headless !== null;
        const headless = hasHeadless ? parseBooleanOrThrow(cli.headless) : undefined;
        const rulesDir = cli.rules || defaultRulesDir();

        const rules = await ruleUtil.loadRules(rulesDir);
        const urls = await urlUtil.getUrls(urlsPath);

        if (concurrency > 10) {
            process.setMaxListeners(concurrency + 10); // prevent warning caused by puppeteer registering listeners for each instance
        }

        const options = {
            output: {
                screenshot: cli.screenshot
            },
            maxConcurrency: concurrency,
            pageTimeoutMs: pageTimeout,
            useIdForScreenshotName: cli.useIdForScreenshotName,
            ruleInitOptions: { destDir }
        };
        if (headless !== undefined) {
            // Previously the parsed value was dropped and --headless had no effect.
            // The key mirrors WebExtractor.DEFAULT_OPTIONS.headless.
            // NOTE(review): confirm WebExtractor reads options.headless.
            options.headless = headless;
        }

        const start = Date.now();
        const webExtractor = new WebExtractor(urls, rules, destDir, options);
        webExtractor.addProgressionListener((progress) => {
            let line = `pending: ${progress.pending}, completed: ${progress.completed}, failed: ${progress.failed}, total: ${progress.total}\n`;
            singleLineLog(line);
        });
        await webExtractor.execute();
        console.log(`done... (elapsed: ${elapsedTime(start)})`);
    } catch (e) {
        if (config.debug) {
            console.log(e);
        } else {
            console.error(e.message);
        }
        // Signal the failure to the caller without cutting off pending output.
        process.exitCode = 1;
    }
}

/**
 * Resolve the "rules" directory that sits next to this script.
 *
 * ES modules do not define __dirname, so the directory is derived from
 * import.meta.url; __dirname is still preferred when a loader provides it.
 *
 * @returns {string} path of the default rules directory
 */
function defaultRulesDir() {
    const scriptDir = typeof __dirname === 'string'
        ? __dirname
        : path.dirname(fileURLToPath(import.meta.url));
    return path.join(scriptDir, 'rules');
}

/**
 * Convert a command line value to a boolean.
 *
 * Values given on the command line arrive as strings, and the string "false"
 * is truthy, so the text has to be interpreted explicitly.
 *
 * @param {boolean|string} value a boolean, or the text "true"/"false" (case-insensitive)
 * @returns {boolean} the parsed value
 * @throws {Error} when the value is neither a boolean nor "true"/"false"
 */
function parseBooleanOrThrow(value) {
    if (typeof value === 'boolean') {
        return value;
    }
    const normalized = String(value).trim().toLowerCase();
    if (normalized === 'true') {
        return true;
    }
    if (normalized === 'false') {
        return false;
    }
    throw new Error(`Could not parse: ${value}`);
}
/**
 * Parse a base-10 integer and fail loudly on anything else.
 *
 * The whole input has to be an integer literal: an optional sign followed by
 * digits, with surrounding whitespace ignored. A bare parseInt would accept
 * partially numeric input such as "10abc" (10), "1.5" (1) or "0x10" (16),
 * which hides typos in command line arguments.
 *
 * @param {string|number} str the value to parse
 * @returns {number} the parsed safe integer
 * @throws {Error} when the input is not a decimal integer or is outside the safe integer range
 */
function parseIntOrThrow(str) {
    const text = String(str).trim();
    const res = /^[+-]?\d+$/.test(text) ? Number.parseInt(text, 10) : NaN;
    if (!Number.isSafeInteger(res)) {
        throw new Error(`Could not parse: ${str}`);
    }
    return res;
}
/**
 * Format the time that has passed since startTime.
 *
 * @param {number} startTime a timestamp in milliseconds, as returned by Date.now()
 * @returns {string} the elapsed time as "<hours>h, <minutes>m, <seconds>s"
 */
function elapsedTime(startTime) {
    const MS_PER_SECOND = 1000;
    const MS_PER_MINUTE = 60 * MS_PER_SECOND;
    const MS_PER_HOUR = 60 * MS_PER_MINUTE;

    // Peel off whole hours, then whole minutes; what is left yields the seconds.
    let remaining = Date.now() - startTime;
    const hours = Math.floor(remaining / MS_PER_HOUR);
    remaining -= hours * MS_PER_HOUR;
    const minutes = Math.floor(remaining / MS_PER_MINUTE);
    remaining -= minutes * MS_PER_MINUTE;
    const secs = Math.floor(remaining / MS_PER_SECOND);

    return `${hours}h, ${minutes}m, ${secs}s`;
}
// Start the CLI. run() reports errors in its own catch block, so the returned promise is not awaited here.
run();