error handling correction and flag for headless mode (or not)

centre-for-humanities-computing · Apr 22, 2020 · 6ac8012
1 parent d5ef68b
commit 6ac8012
Show file tree

Hide file tree

Showing 6 changed files with 14 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -29,6 +29,7 @@ node extract -h
 - **`-c, --concurrency <integer>`** [optional, default=15] - The maximum simultaneous loaded pages
 - **`-n, --no-screenshot`** [optional] - Disable screenshots
 - **`-t, --page-timeout <integer>`** [optional, default=90000] - Milliseconds to wait for the initial loading of a page
+- **`-h, --headless`** [optional, default=true] - Run the browser in headless mode
 - **`-i, --use-id-for-screenshot-name`** [optional] - Use an universal unique id for screenshot names instead of the url
 - **`-x, --debug`** [optional] - Print more detailed error information
 
@@ -93,6 +94,7 @@ a string in JSON-format.
         useIdForScreenshotName: {boolean} default false,
         maxConcurrency: {integer} default 15,
         pageTimeoutMs: {integer} default 90000,
+        headless: {boolean} default true,
         output: {
             screenshot: {boolean} default true,
             logs: {boolean} default true,

diff --git a/cli.js b/cli.js
@@ -15,6 +15,7 @@ const optDesc = {
     concurrency: `The maximum simultaneous loaded web pages`,
     noScreenshot: `Disable screenshots`,
     pageTimeout: `Milliseconds to wait for the initial loading of a page`,
+    headless: `run the browser in headless mode`,
     useIdForScreenshotName: `Use an universal unique id for screenshot names instead of the url`,
     debug: 'Print more detailed error information'
 
@@ -29,6 +30,7 @@ async function run() {
         cli.option('-c, --concurrency <integer>', optDesc.concurrency, WebExtractor.DEFAULT_OPTIONS.maxConcurrency);
         cli.option('-n, --no-screenshot', optDesc.noScreenshot);
         cli.option('-t, --page-timeout <integer>', optDesc.pageTimeout, WebExtractor.DEFAULT_OPTIONS.pageTimeoutMs);
+        cli.option('-h, --headless <boolean', optDesc.headless, WebExtractor.DEFAULT_OPTIONS.headless);
         cli.option('-i, --use-id-for-screenshot-name', optDesc.useIdForScreenshotName, WebExtractor.DEFAULT_OPTIONS.useIdForScreenshotName);
         cli.option('-x, --debug', optDesc.debug, false);
 
@@ -40,6 +42,7 @@ async function run() {
         let concurrency = Math.max(1, parseIntOrThrow(cli.concurrency));
         let takeScreenshot = cli.screenshot;
         let pageTimeout = Math.max(1, parseIntOrThrow(cli.pageTimeout));
+        let headless = cli.headless;
         let useIdForScreenshotName = cli.useIdForScreenshotName;
         let debug = cli.debug;
 

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@chcaa/web-extractor",
-  "version": "0.9.0",
+  "version": "0.9.1",
   "description": "Extract DOM content and take screenshots based on user defined rules",
   "main": "src/index.js",
   "scripts": {

diff --git a/src/index.js b/src/index.js
@@ -15,7 +15,7 @@ class WebExtractorApi {
      * @param destDir the destination dir for extracted data and screenshots
      * @param options additional settings see the readme file
      */
-    constructor(urls, rules, destDir, options) {
+    constructor(urls, rules, destDir, options = {}) {
         this._executed = false;
         this._urls = urls;
         this._rules = rules;

diff --git a/src/model/page-analyzer.js b/src/model/page-analyzer.js
@@ -270,7 +270,7 @@ class PageAnalyzer {
 
     async close() {
         let error;
-        if (this._page) {
+        if (this._page && !this._page.isClosed()) {
             let page = this._page;
             this._page = null;
             try {

diff --git a/src/model/web-extractor.js b/src/model/web-extractor.js
@@ -20,6 +20,7 @@ const DEFAULT_OPTIONS = Object.freeze({
     useIdForScreenshotName: false,
     maxConcurrency: 15,
     pageTimeoutMs: 90000,
+    headless: true,
     output: {
         screenshot: true,
         logs: true,
@@ -62,6 +63,7 @@ class WebExtractor {
         this._useIdForScreenshotName = options.useIdForScreenshotName;
         this._maxConcurrency = options.maxConcurrency;
         this._pageTimeout = options.pageTimeoutMs;
+        this._headless = options.headless;
         this._queue = this._createQueue();
         this._eventEmitter = new EventEmitter();
         this._closeLock = new AwaitLock();
@@ -84,7 +86,9 @@ class WebExtractor {
         }
         this._executed = true;
         // mkdir if not exists
-        await fs.mkdir(this._destDir, {recursive: true});
+        if (this._takeScreenshot || this._saveData || this._saveLogs) {
+            await fs.mkdir(this._destDir, {recursive: true});
+        }
         if (this._takeScreenshot) {
             await fs.mkdir(path.join(this._destDir, 'screenshots'), {recursive: true});
         }
@@ -319,7 +323,7 @@ class WebExtractor {
                 await this._close();
             }
             if (!this._browser) {
-                this._browser = await puppeteer.launch({headless: true, defaultViewport: {width: 1024, height: 1024},
+                this._browser = await puppeteer.launch({headless: this._headless, defaultViewport: {width: 1024, height: 1024},
                     args: []});
                 this._browser.once('disconnected', () => this._browserCloseRequired = true);
             }