Skip to content

Commit

Permalink
Separate confident and imprecise detectors, introduce detector identi…
Browse files Browse the repository at this point in the history
…fication (#717)

Co-authored-by: Sindre Sorhus <[email protected]>
  • Loading branch information
Borewit and sindresorhus authored Jan 7, 2025
1 parent 4db407d commit 356bce8
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 56 deletions.
5 changes: 4 additions & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ If you're adding support for a new file type, please follow the below steps:
- Add a fixture file named `fixture.<extension>` to the `fixture` directory.
- Add the file extension to the `extensions` array in `supported.js`.
- Add the file's MIME type to the `types` array in `supported.js`.
- Add the file type detection logic to the `core.js` file
- Add the file type detection logic to the `core.js` file.
- Determine the appropriate detection confidence category:
- `detectConfident()`: Detections with a high degree of certainty in identifying the correct file type.
- `detectImprecise()`: Detections with limited supporting data, resulting in a higher likelihood of false positives.
- Respect the sequence:
- Signatures with a shorter sample size (counted from offset 0 to the last required byte position) are checked first.
- Only the initial determination for the file type counts for the sequence.
Expand Down
5 changes: 4 additions & 1 deletion core.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,10 @@ console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
@param fileType - The file type detected by standard or previous custom detectors, or `undefined` if no match is found.
@returns The detected file type, or `undefined` if no match is found.
*/
export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
export type Detector = {
	/**
	Identifier of the detector. May be used to recognize the detector in the detector list.
	*/
	id: string;

	/**
	Detection function.

	Receives the tokenizer and, optionally, the file type detected by standard or previous custom detectors. Resolves to the detected file type, or `undefined` if no match is found.
	*/
	detect: (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>;
};

export type FileTypeOptions = {
customDetectors?: Iterable<Detector>;
Expand Down
83 changes: 46 additions & 37 deletions core.js
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,9 @@ export async function fileTypeStream(webStream, options) {

export class FileTypeParser {
constructor(options) {
this.detectors = [...(options?.customDetectors ?? []), this.parse];
this.detectors = [...(options?.customDetectors ?? []),
{id: 'core', detect: this.detectConfident},
{id: 'core.imprecise', detect: this.detectImprecise}];
this.tokenizerOptions = {
abortSignal: options?.signal,
};
Expand All @@ -165,7 +167,7 @@ export class FileTypeParser {

// Iterate through all file-type detectors
for (const detector of this.detectors) {
const fileType = await detector(tokenizer);
const fileType = await detector.detect(tokenizer);
if (fileType) {
return fileType;
}
Expand Down Expand Up @@ -256,7 +258,8 @@ export class FileTypeParser {
return this.check(stringToBytes(header), options);
}

parse = async tokenizer => {
// Detections with a high degree of certainty in identifying the correct file type
detectConfident = async tokenizer => {
this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);

// Keep reading until EOF if the file size is unknown.
Expand Down Expand Up @@ -346,7 +349,7 @@ export class FileTypeParser {
if (this.check([0xEF, 0xBB, 0xBF])) { // UTF-8-BOM
// Strip off UTF-8-BOM
this.tokenizer.ignore(3);
return this.parse(tokenizer);
return this.detectConfident(tokenizer);
}

if (this.check([0x47, 0x49, 0x46])) {
Expand Down Expand Up @@ -1406,39 +1409,6 @@ export class FileTypeParser {
return undefined; // Some unknown text based format
}

// -- Unsafe signatures --

if (
this.check([0x0, 0x0, 0x1, 0xBA])
|| this.check([0x0, 0x0, 0x1, 0xB3])
) {
return {
ext: 'mpg',
mime: 'video/mpeg',
};
}

if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
return {
ext: 'ttf',
mime: 'font/ttf',
};
}

if (this.check([0x00, 0x00, 0x01, 0x00])) {
return {
ext: 'ico',
mime: 'image/x-icon',
};
}

if (this.check([0x00, 0x00, 0x02, 0x00])) {
return {
ext: 'cur',
mime: 'image/x-icon',
};
}

if (this.check([0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1])) {
// Detected Microsoft Compound File Binary File (MS-CFB) Format.
return {
Expand Down Expand Up @@ -1644,6 +1614,45 @@ export class FileTypeParser {
mime: 'application/pgp-encrypted',
};
}
};

// Detections with limited supporting data, resulting in a higher likelihood of false positives
detectImprecise = async tokenizer => {
this.buffer = new Uint8Array(reasonableDetectionSizeInBytes);

// Read initial sample size of 8 bytes
await tokenizer.peekBuffer(this.buffer, {length: Math.min(8, tokenizer.fileInfo.size), mayBeLess: true});

if (
this.check([0x0, 0x0, 0x1, 0xBA])
|| this.check([0x0, 0x0, 0x1, 0xB3])
) {
return {
ext: 'mpg',
mime: 'video/mpeg',
};
}

if (this.check([0x00, 0x01, 0x00, 0x00, 0x00])) {
return {
ext: 'ttf',
mime: 'font/ttf',
};
}

if (this.check([0x00, 0x00, 0x01, 0x00])) {
return {
ext: 'ico',
mime: 'image/x-icon',
};
}

if (this.check([0x00, 0x00, 0x02, 0x00])) {
return {
ext: 'cur',
mime: 'image/x-icon',
};
}

// Check MPEG 1 or 2 Layer 3 header, or 'layer 0' for ADTS (MPEG sync-word 0xFFE)
if (this.buffer.length >= 2 && this.check([0xFF, 0xE0], {offset: 0, mask: [0xFF, 0xE0]})) {
Expand Down
11 changes: 6 additions & 5 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -364,8 +364,9 @@ Below is an example of a custom detector array. This can be passed to the `FileT
```js
import {FileTypeParser} from 'file-type';

const customDetectors = [
async tokenizer => {
const unicornDetector = {
id: 'unicorn', // May be used to recognize the detector in the detector list
async detect(tokenizer) {
const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" in ASCII decimal

const buffer = new Uint8Array(unicornHeader.length);
Expand All @@ -375,11 +376,11 @@ const customDetectors = [
}

return undefined;
},
];
}
}

const buffer = new Uint8Array([85, 78, 73, 67, 79, 82, 78]);
const parser = new FileTypeParser({customDetectors});
const parser = new FileTypeParser({customDetectors: [unicornDetector]});
const fileType = await parser.fromBuffer(buffer);
console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'}
```
Expand Down
33 changes: 21 additions & 12 deletions test.js
Original file line number Diff line number Diff line change
Expand Up @@ -688,22 +688,31 @@ test('corrupt MKV throws', async t => {
});

// Create a custom detector for the just made up "unicorn" file type
const unicornDetector = async tokenizer => {
const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
const buffer = new Uint8Array(7);
await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
if (unicornHeader.every((value, index) => value === buffer[index])) {
return {ext: 'unicorn', mime: 'application/unicorn'};
}
const unicornDetector = {
id: 'mock.unicorn',
async detect(tokenizer) {
const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" as decimal string
const buffer = new Uint8Array(7);
await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true});
if (unicornHeader.every((value, index) => value === buffer[index])) {
return {ext: 'unicorn', mime: 'application/unicorn'};
}

return undefined;
return undefined;
},
};

const mockPngDetector = _tokenizer => ({ext: 'mockPng', mime: 'image/mockPng'});
// Mock detector that reports the made-up "mockPng" type for any input
const mockPngDetector = {
	id: 'mock.png',
	detect() {
		// The tokenizer is never inspected; a fresh result object is built on every call
		return {
			ext: 'mockPng',
			mime: 'image/mockPng',
		};
	},
};

const tokenizerPositionChanger = tokenizer => {
const buffer = new Uint8Array(1);
tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
const tokenizerPositionChanger = {
id: 'mock.dirtyTokenizer',
detect(tokenizer) {
const buffer = new Uint8Array(1);
tokenizer.readBuffer(buffer, {length: 1, mayBeLess: true});
},
};

if (nodeMajorVersion >= nodeVersionSupportingByteBlobStream) {
Expand Down

0 comments on commit 356bce8

Please sign in to comment.