Skip to content

Commit

Permalink
error handling correction and flag for headless mode (or not)
Browse files Browse the repository at this point in the history
  • Loading branch information
pbvahlst committed Apr 22, 2020
1 parent d5ef68b commit 6ac8012
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 5 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ node extract -h
- **`-c, --concurrency <integer>`** [optional, default=15] - The maximum simultaneous loaded pages
- **`-n, --no-screenshot`** [optional] - Disable screenshots
- **`-t, --page-timeout <integer>`** [optional, default=90000] - Milliseconds to wait for the initial loading of a page
- **`-h, --headless <boolean>`** [optional, default=true] - Run the browser in headless mode
- **`-i, --use-id-for-screenshot-name`** [optional] - Use a universally unique id for screenshot names instead of the url
- **`-x, --debug`** [optional] - Print more detailed error information

Expand Down Expand Up @@ -93,6 +94,7 @@ a string in JSON-format.
useIdForScreenshotName: {boolean} default false,
maxConcurrency: {integer} default 15,
pageTimeoutMs: {integer} default 90000,
headless: {boolean} default true,
output: {
screenshot: {boolean} default true,
logs: {boolean} default true,
Expand Down
3 changes: 3 additions & 0 deletions cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ const optDesc = {
concurrency: `The maximum simultaneous loaded web pages`,
noScreenshot: `Disable screenshots`,
pageTimeout: `Milliseconds to wait for the initial loading of a page`,
headless: `run the browser in headless mode`,
useIdForScreenshotName: `Use an universal unique id for screenshot names instead of the url`,
debug: 'Print more detailed error information'

Expand All @@ -29,6 +30,7 @@ async function run() {
cli.option('-c, --concurrency <integer>', optDesc.concurrency, WebExtractor.DEFAULT_OPTIONS.maxConcurrency);
cli.option('-n, --no-screenshot', optDesc.noScreenshot);
cli.option('-t, --page-timeout <integer>', optDesc.pageTimeout, WebExtractor.DEFAULT_OPTIONS.pageTimeoutMs);
cli.option('-h, --headless <boolean', optDesc.headless, WebExtractor.DEFAULT_OPTIONS.headless);
cli.option('-i, --use-id-for-screenshot-name', optDesc.useIdForScreenshotName, WebExtractor.DEFAULT_OPTIONS.useIdForScreenshotName);
cli.option('-x, --debug', optDesc.debug, false);

Expand All @@ -40,6 +42,7 @@ async function run() {
let concurrency = Math.max(1, parseIntOrThrow(cli.concurrency));
let takeScreenshot = cli.screenshot;
let pageTimeout = Math.max(1, parseIntOrThrow(cli.pageTimeout));
let headless = cli.headless;
let useIdForScreenshotName = cli.useIdForScreenshotName;
let debug = cli.debug;

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@chcaa/web-extractor",
"version": "0.9.0",
"version": "0.9.1",
"description": "Extract DOM content and take screenshots based on user defined rules",
"main": "src/index.js",
"scripts": {
Expand Down
2 changes: 1 addition & 1 deletion src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class WebExtractorApi {
* @param destDir the destination dir for extracted data and screenshots
* @param options additional settings see the readme file
*/
constructor(urls, rules, destDir, options) {
constructor(urls, rules, destDir, options = {}) {
this._executed = false;
this._urls = urls;
this._rules = rules;
Expand Down
2 changes: 1 addition & 1 deletion src/model/page-analyzer.js
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ class PageAnalyzer {

async close() {
let error;
if (this._page) {
if (this._page && !this._page.isClosed()) {
let page = this._page;
this._page = null;
try {
Expand Down
8 changes: 6 additions & 2 deletions src/model/web-extractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ const DEFAULT_OPTIONS = Object.freeze({
useIdForScreenshotName: false,
maxConcurrency: 15,
pageTimeoutMs: 90000,
headless: true,
output: {
screenshot: true,
logs: true,
Expand Down Expand Up @@ -62,6 +63,7 @@ class WebExtractor {
this._useIdForScreenshotName = options.useIdForScreenshotName;
this._maxConcurrency = options.maxConcurrency;
this._pageTimeout = options.pageTimeoutMs;
this._headless = options.headless;
this._queue = this._createQueue();
this._eventEmitter = new EventEmitter();
this._closeLock = new AwaitLock();
Expand All @@ -84,7 +86,9 @@ class WebExtractor {
}
this._executed = true;
// mkdir if not exists
await fs.mkdir(this._destDir, {recursive: true});
if (this._takeScreenshot || this._saveData || this._saveLogs) {
await fs.mkdir(this._destDir, {recursive: true});
}
if (this._takeScreenshot) {
await fs.mkdir(path.join(this._destDir, 'screenshots'), {recursive: true});
}
Expand Down Expand Up @@ -319,7 +323,7 @@ class WebExtractor {
await this._close();
}
if (!this._browser) {
this._browser = await puppeteer.launch({headless: true, defaultViewport: {width: 1024, height: 1024},
this._browser = await puppeteer.launch({headless: this._headless, defaultViewport: {width: 1024, height: 1024},
args: []});
this._browser.once('disconnected', () => this._browserCloseRequired = true);
}
Expand Down

0 comments on commit 6ac8012

Please sign in to comment.