Skip to content

Commit

Permalink
fix: be more lenient when decompressing (#64)
Browse files Browse the repository at this point in the history
  • Loading branch information
szmarczak authored Jan 10, 2022
1 parent f5961f9 commit 07ea3b4
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 1 deletion.
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "got-scraping",
"version": "3.2.7",
"version": "3.2.8",
"description": "HTTP client made for scraping based on got.",
"main": "dist/index.js",
"engines": {
Expand All @@ -13,6 +13,7 @@
"got-cjs": "12.0.1",
"header-generator": "^1.1.1",
"http2-wrapper": "^2.1.4",
"mimic-response": "^3.1.0",
"ow": "^0.23.0",
"quick-lru": "^5.1.1",
"tslib": "^2.3.1"
Expand Down
8 changes: 8 additions & 0 deletions src/hooks/browser-headers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ export async function browserHeadersHook(options: Options): Promise<void> {
});
}

if (!options.decompress) {
for (const key of Object.keys(generatedHeaders)) {
if (key.toLowerCase() === 'accept-encoding') {
delete generatedHeaders[key];
}
}
}

// TODO: Use `options.merge({headers: generatedHeaders})` instead
options.headers = mergeHeaders(generatedHeaders, options.headers as Record<string, string>);
}
95 changes: 95 additions & 0 deletions src/hooks/fix-decompress.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import zlib from 'zlib';
import { ClientRequest, IncomingMessage } from 'http';
import { HandlerFunction } from 'got-cjs';
import { PassThrough, Transform } from 'stream';
import mimicResponse from 'mimic-response';

/**
 * Leniently decompresses an HTTP response based on its `content-encoding` header
 * and hands the resulting stream to `propagate`.
 *
 * - `gzip` / `x-gzip`: decompressed with gunzip.
 * - `deflate` / `x-deflate`: the first body byte is inspected to decide between
 *   zlib-wrapped and raw deflate, since some servers send raw deflate.
 * - `br`: decompressed with Brotli.
 * - Anything else (or a `deflate` response without a body): the response is
 *   propagated untouched.
 *
 * When a decompressor is used, the `content-encoding` header is removed from
 * `response.headers` so that downstream consumers do not decompress again.
 *
 * @param response - The raw incoming response.
 * @param propagate - Called exactly once per handled response with either the original
 * response or a stream yielding the decompressed body. For `deflate` it is called
 * asynchronously (on the first `data` or on `end`).
 */
const onResponse = (response: IncomingMessage, propagate: (fixedResponse: IncomingMessage) => void) => {
    const encoding = response.headers['content-encoding']?.toLowerCase();

    // With Z_SYNC_FLUSH as `finishFlush`, zlib returns whatever it managed to decode
    // from a truncated stream instead of failing with "unexpected end of file".
    const zlibOptions = {
        flush: zlib.constants.Z_SYNC_FLUSH,
        finishFlush: zlib.constants.Z_SYNC_FLUSH,
    };

    const useDecompressor = (decompressor: Transform) => {
        // Prevents the consumer from trying to decompress the already decompressed body.
        delete response.headers['content-encoding'];

        const result = new PassThrough({
            autoDestroy: false,
            destroy(error, callback) {
                // Destroying the decompressed stream must tear down the underlying response too.
                response.destroy();

                callback(error);
            },
        });

        // `pipe()` does not forward errors, so decompression failures are surfaced manually.
        decompressor.once('error', (error) => {
            result.destroy(error);
        });

        response.pipe(decompressor).pipe(result);

        // `mimicResponse` is expected to make `result` look like the original response
        // (status code, headers, ...) while yielding the decompressed body.
        propagate(mimicResponse(response, result));
    };

    if (encoding === 'gzip' || encoding === 'x-gzip') {
        useDecompressor(zlib.createGunzip(zlibOptions));
    } else if (encoding === 'deflate' || encoding === 'x-deflate') {
        let read = false;

        response.once('data', (chunk: Buffer) => {
            read = true;

            // Put the sniffed chunk back so the decompressor receives the whole body.
            response.unshift(chunk);

            // A zlib header has the compression method 8 (deflate) in the low nibble of its first byte;
            // anything else is treated as raw deflate.
            // See http://stackoverflow.com/questions/37519828
            // eslint-disable-next-line no-bitwise
            const isZlibWrapped = (chunk[0] & 0x0F) === 0x08;

            // Use the same lenient options as for gzip, so truncated deflate bodies do not throw either.
            const decompressor = isZlibWrapped
                ? zlib.createInflate(zlibOptions)
                : zlib.createInflateRaw(zlibOptions);

            useDecompressor(decompressor);
        });

        response.once('end', () => {
            // Empty body: there is nothing to decompress, pass the response through as is.
            if (!read) {
                propagate(response);
            }
        });
    } else if (encoding === 'br') {
        useDecompressor(zlib.createBrotliDecompress());
    } else {
        propagate(response);
    }
};

/**
 * Got handler that makes response decompression lenient.
 *
 * Some websites compress their responses incorrectly. Got is very strict and would throw,
 * while browsers do not, so we need to fix this: when `options.decompress` is enabled,
 * every `response` event of the underlying request is intercepted and the response is
 * run through `onResponse` before it is emitted to the listeners.
 */
export const fixDecompress: HandlerFunction = (options, next) => {
    const gotRequest = next(options);

    // @ts-expect-error Looks like a TypeScript bug
    gotRequest.on('request', (request: ClientRequest) => {
        const originalEmit = request.emit.bind(request);

        request.emit = (event: string, ...args: unknown[]) => {
            const interceptResponse = event === 'response' && options.decompress;

            // Everything except a response that should be decompressed goes straight through.
            if (!interceptResponse) {
                return originalEmit(event, ...args);
            }

            // Got will not decompress a second time, because it checks the content-encoding header,
            // which is deleted whenever the response gets decompressed here.
            const rawResponse = args[0] as IncomingMessage;

            // The fixed response may be emitted later, so the return value of `emit`
            // is derived from the listener count at this point.
            const hadListeners = request.listenerCount('response') > 0;

            onResponse(rawResponse, (fixedResponse: IncomingMessage) => {
                originalEmit('response', fixedResponse);
            });

            return hadListeners;
        };
    });

    return gotRequest;
};
4 changes: 4 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@ import { http2Hook } from './hooks/http2';
import { insecureParserHook } from './hooks/insecure-parser';
import { tlsHook } from './hooks/tls';
import { sessionDataHook } from './hooks/storage';
import { fixDecompress } from './hooks/fix-decompress';

const gotScraping = gotCjs.extend({
handlers: [
fixDecompress,
],
mutableDefaults: true,
// Most of the new browsers use HTTP/2
http2: true,
Expand Down
6 changes: 6 additions & 0 deletions test/helpers/dummy-server.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { Server } from 'http';
import zlib from 'zlib';
import express, { Express } from 'express';
import bodyParser from 'body-parser';

Expand Down Expand Up @@ -57,6 +58,11 @@ const startDummyServer = async (port = 0): Promise<Server> => {
res.send(req.url.slice(req.url.indexOf('?') + 1));
});

app.get('/invalid-deflate', (_req, res) => {
res.setHeader('content-encoding', 'deflate');
res.send(zlib.deflateRawSync('ok'));
});

return startExpressAppPromise(app, port);
};

Expand Down
5 changes: 5 additions & 0 deletions test/main.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,11 @@ describe('GotScraping', () => {
});
});

test('is lenient on decompression', async () => {
const response = await gotScraping.get(`http://localhost:${port}/invalid-deflate`);
expect(response.body).toBe('ok');
});

describe('same thing with streams', () => {
test('should order headers', async () => {
const body = await getStream(gotScraping.stream({ url: 'https://api.apify.com/v2/browser-info?rawHeaders=1' }));
Expand Down

0 comments on commit 07ea3b4

Please sign in to comment.