diff --git a/package.json b/package.json
index 3c29e65..4f13beb 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
     "name": "got-scraping",
-    "version": "3.2.7",
+    "version": "3.2.8",
     "description": "HTTP client made for scraping based on got.",
     "main": "dist/index.js",
     "engines": {
@@ -13,6 +13,7 @@
         "got-cjs": "12.0.1",
         "header-generator": "^1.1.1",
         "http2-wrapper": "^2.1.4",
+        "mimic-response": "^3.1.0",
         "ow": "^0.23.0",
         "quick-lru": "^5.1.1",
         "tslib": "^2.3.1"
diff --git a/src/hooks/browser-headers.ts b/src/hooks/browser-headers.ts
index 99149fe..7386314 100644
--- a/src/hooks/browser-headers.ts
+++ b/src/hooks/browser-headers.ts
@@ -99,6 +99,14 @@ export async function browserHeadersHook(options: Options): Promise {
         });
     }
 
+    if (!options.decompress) {
+        for (const key of Object.keys(generatedHeaders)) {
+            if (key.toLowerCase() === 'accept-encoding') {
+                delete generatedHeaders[key];
+            }
+        }
+    }
+
     // TODO: Use `options.merge({headers: generatedHeaders})` instead
     options.headers = mergeHeaders(generatedHeaders, options.headers as Record);
 }
diff --git a/src/hooks/fix-decompress.ts b/src/hooks/fix-decompress.ts
new file mode 100644
index 0000000..766a151
--- /dev/null
+++ b/src/hooks/fix-decompress.ts
@@ -0,0 +1,123 @@
+import zlib from 'zlib';
+import { ClientRequest, IncomingMessage } from 'http';
+import { HandlerFunction } from 'got-cjs';
+import { PassThrough, Transform } from 'stream';
+import mimicResponse from 'mimic-response';
+
+/**
+ * Decompresses `response` leniently according to its `content-encoding` header.
+ *
+ * @param response Raw response received from the server.
+ * @param propagate Called with the response to forward: a decompressed stand-in
+ * for gzip, deflate and brotli bodies, the original response otherwise.
+ */
+const onResponse = (response: IncomingMessage, propagate: (fixedResponse: IncomingMessage) => void) => {
+    const encoding = response.headers['content-encoding']?.toLowerCase();
+
+    // Flush whatever has been decoded when the input ends, so that a truncated
+    // body yields its readable part instead of an "unexpected end" error.
+    const zlibOptions = {
+        flush: zlib.constants.Z_SYNC_FLUSH,
+        finishFlush: zlib.constants.Z_SYNC_FLUSH,
+    };
+
+    // Brotli has its own operation constants; these are the equivalent of the options above.
+    const brotliOptions = {
+        flush: zlib.constants.BROTLI_OPERATION_FLUSH,
+        finishFlush: zlib.constants.BROTLI_OPERATION_FLUSH,
+    };
+
+    // Pipes the response through `decompressor` and propagates a stand-in response.
+    const useDecompressor = (decompressor: Transform) => {
+        // The body is about to be decoded here, so Got must not try to decompress it again.
+        delete response.headers['content-encoding'];
+
+        const result = new PassThrough({
+            autoDestroy: false,
+            destroy(error, callback) {
+                // Destroying the stand-in also tears down the underlying response.
+                response.destroy();
+
+                callback(error);
+            },
+        });
+
+        // `pipe()` does not forward errors, so surface decompression failures on the stand-in.
+        decompressor.once('error', (error) => {
+            result.destroy(error);
+        });
+
+        response.pipe(decompressor).pipe(result);
+
+        // Make the decompressed stream mimic the original response before handing it over.
+        propagate(mimicResponse(response, result));
+    };
+
+    if (encoding === 'gzip' || encoding === 'x-gzip') {
+        useDecompressor(zlib.createGunzip(zlibOptions));
+    } else if (encoding === 'deflate' || encoding === 'x-deflate') {
+        let read = false;
+
+        response.once('data', (chunk: Buffer) => {
+            read = true;
+
+            // Put the sniffed chunk back so that the decompressor sees the whole body.
+            response.unshift(chunk);
+
+            // The first byte tells zlib-wrapped deflate apart from raw deflate.
+            // See http://stackoverflow.com/questions/37519828
+            // eslint-disable-next-line no-bitwise
+            const decompressor = (chunk[0] & 0x0F) === 0x08 ? zlib.createInflate(zlibOptions) : zlib.createInflateRaw(zlibOptions);
+            useDecompressor(decompressor);
+        });
+
+        response.once('end', () => {
+            // No data was ever read, so there is nothing to decompress.
+            if (!read) {
+                propagate(response);
+            }
+        });
+    } else if (encoding === 'br') {
+        useDecompressor(zlib.createBrotliDecompress(brotliOptions));
+    } else {
+        propagate(response);
+    }
+};
+
+/**
+ * Got handler that decompresses responses leniently.
+ *
+ * Some websites incorrectly compress the response.
+ * Got is very strict so it would throw.
+ * Browsers don't, so we need to fix this.
+ */
+export const fixDecompress: HandlerFunction = (options, next) => {
+    const result = next(options);
+
+    // @ts-expect-error Looks like a TypeScript bug
+    result.on('request', (request: ClientRequest) => {
+        const emit = request.emit.bind(request);
+
+        // Intercept `response` events so the response can be swapped before any listener sees it.
+        request.emit = (event: string, ...args: unknown[]) => {
+            // It won't double decompress, because Got checks the content-encoding header.
+            // We delete it if the response is compressed.
+            if (event === 'response' && options.decompress) {
+                const response = args[0] as IncomingMessage;
+
+                // The real emit may happen later, so report whether anyone is listening right now.
+                const emitted = request.listenerCount('response') !== 0;
+
+                onResponse(response, (fixedResponse: IncomingMessage) => {
+                    emit('response', fixedResponse);
+                });
+
+                return emitted;
+            }
+
+            return emit(event, ...args);
+        };
+    });
+
+    return result;
+};
diff --git a/src/index.ts b/src/index.ts
index 2744cf2..295e68a 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -16,8 +16,12 @@ import { http2Hook } from './hooks/http2';
 import { insecureParserHook } from './hooks/insecure-parser';
 import { tlsHook } from './hooks/tls';
 import { sessionDataHook } from './hooks/storage';
+import { fixDecompress } from './hooks/fix-decompress';
 
 const gotScraping = gotCjs.extend({
+    handlers: [
+        fixDecompress,
+    ],
     mutableDefaults: true,
     // Most of the new browsers use HTTP/2
     http2: true,
diff --git a/test/helpers/dummy-server.ts b/test/helpers/dummy-server.ts
index 1b61fcc..d60f25d 100644
--- a/test/helpers/dummy-server.ts
+++ b/test/helpers/dummy-server.ts
@@ -1,4 +1,5 @@
 import { Server } from 'http';
+import zlib from 'zlib';
 import express, { Express } from 'express';
 import bodyParser from 'body-parser';
 
@@ -57,6 +58,11 @@ const startDummyServer = async (port = 0): Promise => {
         res.send(req.url.slice(req.url.indexOf('?') + 1));
     });
 
+    app.get('/invalid-deflate', (_req, res) => {
+        res.setHeader('content-encoding', 'deflate');
+        res.send(zlib.deflateRawSync('ok'));
+    });
+
     return startExpressAppPromise(app, port);
 };
 
diff --git a/test/main.test.ts b/test/main.test.ts
index f050c28..1f1b768 100644
--- a/test/main.test.ts
+++ b/test/main.test.ts
@@ -338,6 +338,11 @@ describe('GotScraping', () => {
         });
     });
 
+    test('is lenient on decompression', async () => {
+        const response = await gotScraping.get(`http://localhost:${port}/invalid-deflate`);
+        expect(response.body).toBe('ok');
+    });
+
     describe('same thing with streams', () => {
         test('should order headers', async () => {
             const body = await getStream(gotScraping.stream({ url: 'https://api.apify.com/v2/browser-info?rawHeaders=1' }));