From 7e72bbccd317efc5d5aed55fdd3af72d3857c781 Mon Sep 17 00:00:00 2001 From: Borewit Date: Thu, 19 Dec 2024 18:34:57 +0100 Subject: [PATCH] Give API access to `FileTypeParser#detectors` (#704) Co-authored-by: Sindre Sorhus --- core.d.ts | 45 +++++++++++++++++++++++++++------------------ core.js | 14 +++++--------- readme.md | 42 +++++++++++++++++++++++++++--------------- 3 files changed, 59 insertions(+), 42 deletions(-) diff --git a/core.d.ts b/core.d.ts index d782a9a5..543c452b 100644 --- a/core.d.ts +++ b/core.d.ts @@ -114,31 +114,35 @@ console.log(await fileTypeFromBlob(blob)); export declare function fileTypeFromBlob(blob: Blob): Promise<FileTypeResult | undefined>; /** -Function that allows specifying custom detection mechanisms. +A custom file type detector. -An iterable of detectors can be provided via the `fileTypeOptions` argument for the {@link FileTypeParser.constructor}. +Detectors can be added via the constructor options or by directly modifying `FileTypeParser#detectors`. -The detectors are called before the default detections in the provided order. +Detectors provided through the constructor options are executed before the default detectors. -Custom detectors can be used to add new `FileTypeResults` or to modify return behavior of existing `FileTypeResult` detections. +Custom detectors allow for: +- Introducing new `FileTypeResult` entries. +- Modifying the detection behavior of existing `FileTypeResult` types. -If the detector returns `undefined`, there are 2 possible scenarios: +### Detector execution flow -1. The detector has not read from the tokenizer, it will be proceeded with the next available detector. -2. The detector has read from the tokenizer (`tokenizer.position` has been increased). - In that case no further detectors will be executed and the final conclusion is that file-type returns undefined. - Note that this an exceptional scenario, as the detector takes the opportunity from any other detector to determine the file type. 
+If a detector returns `undefined`, the following rules apply: -Example detector array which can be extended and provided via the fileTypeOptions argument: +1. **No Tokenizer Interaction**: If the detector does not modify the tokenizer's position, the next detector in the sequence is executed. +2. **Tokenizer Interaction**: If the detector modifies the tokenizer's position (`tokenizer.position` is advanced), no further detectors are executed. In this case, the file type remains `undefined`, as subsequent detectors cannot evaluate the content. This is an exceptional scenario, as it prevents any other detectors from determining the file type. + +### Example usage + +Below is an example of a custom detector array. This can be passed to the `FileTypeParser` via the `fileTypeOptions` argument. ``` import {FileTypeParser} from 'file-type'; const customDetectors = [ async tokenizer => { - const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // 'UNICORN' as decimal string + const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" in ASCII decimal - const buffer = Buffer.alloc(7); + const buffer = new Uint8Array(unicornHeader.length); await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true}); if (unicornHeader.every((value, index) => value === buffer[index])) { return {ext: 'unicorn', mime: 'application/unicorn'}; @@ -148,15 +152,15 @@ const customDetectors = [ }, ]; -const buffer = Buffer.from('UNICORN'); +const buffer = new Uint8Array([85, 78, 73, 67, 79, 82, 78]); const parser = new FileTypeParser({customDetectors}); const fileType = await parser.fromBuffer(buffer); -console.log(fileType); +console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'} ``` -@param tokenizer - The [tokenizer](https://github.com/Borewit/strtok3#tokenizer) used to read the file content from. -@param fileType - The file type detected by the standard detections or a previous custom detection, or `undefined`` if no matching file type could be found. 
-@returns The detected file type, or `undefined` when there is no match. +@param tokenizer - The [tokenizer](https://github.com/Borewit/strtok3#tokenizer) used to read file content. +@param fileType - The file type detected by standard or previous custom detectors, or `undefined` if no match is found. +@returns The detected file type, or `undefined` if no match is found. */ export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>; @@ -180,7 +184,12 @@ This method can be handy to put in a stream pipeline, but it comes with a price. export function fileTypeStream(webStream: AnyWebReadableStream<Uint8Array>, options?: StreamOptions): Promise<AnyWebReadableByteStreamWithFileType>; export declare class FileTypeParser { - detectors: Iterable<Detector>; + /** + File type detectors. + + Initialized with a single entry holding the built-in detector function. + */ + detectors: Detector[]; constructor(options?: {customDetectors?: Iterable<Detector>; signal?: AbortSignal}); diff --git a/core.js b/core.js index 131c9d74..ae1bea62 100644 --- a/core.js +++ b/core.js @@ -109,19 +109,17 @@ export async function fileTypeStream(webStream, options) { export class FileTypeParser { constructor(options) { - this.detectors = options?.customDetectors; + this.detectors = [...(options?.customDetectors ?? 
[]), this.parse]; this.tokenizerOptions = { abortSignal: options?.signal, }; - this.fromTokenizer = this.fromTokenizer.bind(this); - this.fromBuffer = this.fromBuffer.bind(this); - this.parse = this.parse.bind(this); } async fromTokenizer(tokenizer) { const initialPosition = tokenizer.position; - for (const detector of this.detectors || []) { + // Iterate through all file-type detectors + for (const detector of this.detectors) { const fileType = await detector(tokenizer); if (fileType) { return fileType; @@ -131,8 +129,6 @@ export class FileTypeParser { return undefined; // Cannot proceed scanning of the tokenizer is at an arbitrary position } } - - return this.parse(tokenizer); } async fromBuffer(input) { @@ -215,7 +211,7 @@ export class FileTypeParser { return this.check(stringToBytes(header), options); } - async parse(tokenizer) { + parse = async tokenizer => { this.buffer = new Uint8Array(reasonableDetectionSizeInBytes); // Keep reading until EOF if the file size is unknown. @@ -1647,7 +1643,7 @@ export class FileTypeParser { }; } } - } + }; async readTiffTag(bigEndian) { const tagId = await this.tokenizer.readToken(bigEndian ? Token.UINT16_BE : Token.UINT16_LE); diff --git a/readme.md b/readme.md index a64e3942..01f81af5 100644 --- a/readme.md +++ b/readme.md @@ -340,33 +340,36 @@ Returns a `Set` of supported MIME types. ## Custom detectors -A custom detector is a function that allows specifying custom detection mechanisms. +A custom file type detector. -An iterable of detectors can be provided via the `fileTypeOptions` argument for the `FileTypeParser` constructor. +Detectors can be added via the constructor options or by directly modifying `FileTypeParser#detectors`. -The detectors are called before the default detections in the provided order. +Detectors provided through the constructor options are executed before the default detectors. 
-Custom detectors can be used to add new `FileTypeResults` or to modify return behaviour of existing `FileTypeResult` detections. +Custom detectors allow for: +- Introducing new `FileTypeResult` entries. +- Modifying the detection behavior of existing `FileTypeResult` types. -If the detector returns `undefined`, there are 2 possible scenarios: +### Detector execution flow -1. The detector has not read from the tokenizer, it will be proceeded with the next available detector. -2. The detector has read from the tokenizer (`tokenizer.position` has been increased). - In that case no further detectors will be executed and the final conclusion is that file-type returns undefined. - Note that this an exceptional scenario, as the detector takes the opportunity from any other detector to determine the file type. +If a detector returns `undefined`, the following rules apply: -Example detector array which can be extended and provided to each public method via the `fileTypeOptions` argument: +1. **No Tokenizer Interaction**: If the detector does not modify the tokenizer's position, the next detector in the sequence is executed. +2. **Tokenizer Interaction**: If the detector modifies the tokenizer's position (`tokenizer.position` is advanced), no further detectors are executed. In this case, the file type remains `undefined`, as subsequent detectors cannot evaluate the content. This is an exceptional scenario, as it prevents any other detectors from determining the file type. + +### Example usage + +Below is an example of a custom detector array. This can be passed to the `FileTypeParser` via the `fileTypeOptions` argument. 
```js import {FileTypeParser} from 'file-type'; const customDetectors = [ async tokenizer => { - const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // 'UNICORN' as decimal string + const unicornHeader = [85, 78, 73, 67, 79, 82, 78]; // "UNICORN" in ASCII decimal - const buffer = new Uint8Array(7); + const buffer = new Uint8Array(unicornHeader.length); await tokenizer.peekBuffer(buffer, {length: unicornHeader.length, mayBeLess: true}); - if (unicornHeader.every((value, index) => value === buffer[index])) { return {ext: 'unicorn', mime: 'application/unicorn'}; } @@ -375,10 +378,19 @@ const customDetectors = [ }, ]; -const buffer = new Uint8Array(new TextEncoder().encode('UNICORN')); +const buffer = new Uint8Array([85, 78, 73, 67, 79, 82, 78]); const parser = new FileTypeParser({customDetectors}); const fileType = await parser.fromBuffer(buffer); -console.log(fileType); +console.log(fileType); // {ext: 'unicorn', mime: 'application/unicorn'} +``` + +```ts +/** +@param tokenizer - The [tokenizer](https://github.com/Borewit/strtok3#tokenizer) used to read file content. +@param fileType - The file type detected by standard or previous custom detectors, or `undefined` if no match is found. +@returns The detected file type, or `undefined` if no match is found. +*/ +export type Detector = (tokenizer: ITokenizer, fileType?: FileTypeResult) => Promise<FileTypeResult | undefined>; ``` ## Abort signal