diff --git a/package-lock.json b/package-lock.json
index f413aab..b735ffd 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
     "name": "got-scraping",
-    "version": "3.2.16",
+    "version": "4.0.0",
     "lockfileVersion": 3,
     "requires": true,
     "packages": {
         "": {
             "name": "got-scraping",
-            "version": "3.2.16",
+            "version": "4.0.0",
             "license": "Apache-2.0",
             "dependencies": {
                 "got": "^13.0.0",
diff --git a/package.json b/package.json
index 8aecc68..5e4c184 100644
--- a/package.json
+++ b/package.json
@@ -53,7 +53,8 @@
         "prepublishOnly": "npm run build",
         "lint": "eslint src test",
         "lint:fix": "eslint src test --fix",
-        "test": "node --experimental-vm-modules ./node_modules/jest/bin/jest.js --coverage"
+        "test": "node --experimental-vm-modules ./node_modules/jest/bin/jest.js --coverage",
+        "test-blocking": "ts-node -T ./test/live-testing/index.js"
     },
     "author": {
         "name": "Apify",
diff --git a/test/live-testing/index.js b/test/live-testing/index.js
new file mode 100644
index 0000000..915876f
--- /dev/null
+++ b/test/live-testing/index.js
@@ -0,0 +1,117 @@
+import { readFileSync } from 'fs';
+import { setTimeout } from 'timers/promises';
+import { gotScraping } from '../../dist/index.js';
+import got from 'got';
+
+/**
+ * Drains the shared `urls` queue, requesting every URL through `gotImplementation`
+ * and classifying each outcome.
+ *
+ * The queue is consumed in place with `shift()`, so several concurrent calls can
+ * share one array and split the work between them.
+ *
+ * @param {{ get: Function }} gotImplementation - Client exposing a got-style `get(url)`.
+ * @param {string[]} urls - Work queue; emptied as URLs are processed.
+ * @param {number} [timeoutMs=5000] - Time budget per request, in milliseconds.
+ * @returns {Promise<{ passed: number, blocked: number, failed: number }>} Counts of
+ *   responses without the challenge marker, responses containing it, and requests
+ *   that threw, timed out, or returned no body.
+ */
+async function processUrls(gotImplementation, urls, timeoutMs = 5000) {
+    let passed = 0;
+    let blocked = 0;
+    let failed = 0;
+
+    // Test the queue length, not the shifted value: an empty string is falsy and
+    // would otherwise end this worker while URLs are still queued.
+    while (urls.length > 0) {
+        const url = urls.shift();
+        const timer = new AbortController();
+
+        try {
+            const request = gotImplementation.get(url);
+
+            // The timer resolves with `undefined`, so a missing body means the
+            // request lost the race (or the response was empty).
+            const result = await Promise.race([
+                request,
+                setTimeout(timeoutMs, undefined, { signal: timer.signal }),
+            ]);
+
+            if (!result?.body) {
+                request.cancel();
+                throw new Error('timeout');
+            }
+
+            if (result.body.includes('Just a moment...')) {
+                blocked++;
+            } else {
+                passed++;
+            }
+        } catch {
+            // Anything thrown by the request or the timeout check counts as a failure.
+            failed++;
+        } finally {
+            // Release the timer so finished requests leave nothing pending.
+            timer.abort();
+        }
+    }
+
+    return { passed, blocked, failed };
+}
+
+/**
+ * Runs `concurrency` `processUrls` workers over a private copy of `urls` and
+ * sums their counters.
+ *
+ * @param {{ get: Function }} implementation - Client exposing a got-style `get(url)`.
+ * @param {string[]} urls - URLs to request; the caller's array is not modified.
+ * @param {number} [concurrency=5] - Number of workers sharing the queue.
+ * @returns {Promise<{ passed: number, blocked: number, failed: number }>} Totals
+ *   across all workers.
+ */
+async function runInParallel(implementation, urls, concurrency = 5) {
+    const localUrls = [...urls];
+    const partialResults = await Promise.all(
+        Array.from({ length: concurrency }, () => processUrls(implementation, localUrls)),
+    );
+
+    return partialResults.reduce((acc, { passed, blocked, failed }) => {
+        acc.passed += passed;
+        acc.blocked += blocked;
+        acc.failed += failed;
+        return acc;
+    }, { passed: 0, blocked: 0, failed: 0 });
+}
+
+// Fetches the list of test sites, then runs got-scraping and plain got against
+// it side by side and prints both sets of counters.
+(async () => {
+    try {
+        const { body } = await got.get('https://raw.githubusercontent.com/apify/fingerprint-suite/master/test/antibot-services/live-testing/cloudflare-websites.csv');
+
+        // Trim each line and drop blanks (trailing newline, CRLF endings) so that
+        // they are neither requested nor counted as failures.
+        const urls = body
+            .split('\n')
+            .map((line) => line.trim())
+            .filter((line) => line.length > 0);
+
+        const [gotScrapingResults, gotResults] = await Promise.all([
+            runInParallel(gotScraping, urls),
+            runInParallel(got, urls),
+        ]);
+
+        console.log('got-scraping');
+        console.log(gotScrapingResults);
+
+        console.log('---');
+        console.log('got');
+        console.log(gotResults);
+
+        process.exit(0);
+    } catch (error) {
+        console.error(error);
+        process.exit(1);
+    }
+})();