From 59d6f9b6b6ef6f4b8c902e62e91bd3ca14afdbb8 Mon Sep 17 00:00:00 2001 From: Matt Leon <108271225+wydengyre@users.noreply.github.com> Date: Sat, 15 Jun 2024 08:30:43 -0400 Subject: [PATCH 1/7] tesseract-code-d-ts style (#46) Preparing for use of biome. --- src/tesseract-core.d.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tesseract-core.d.ts b/src/tesseract-core.d.ts index 4bc0cd3..24c4b4e 100644 --- a/src/tesseract-core.d.ts +++ b/src/tesseract-core.d.ts @@ -1,5 +1,5 @@ // Type definitions for the Emscripten-generated JS entry point for the // WASM file. export default function initTesseractCore( - options: { wasmBinary?: ArrayBuffer } = {} + options: { wasmBinary?: ArrayBuffer } = {}, ): Promise; From a170b3feec87c671ec8b5264c7dc94ba24bad000 Mon Sep 17 00:00:00 2001 From: Matt Leon <108271225+wydengyre@users.noreply.github.com> Date: Sat, 15 Jun 2024 08:36:49 -0400 Subject: [PATCH 2/7] node 22.3.0 (#48) --- .node-version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.node-version b/.node-version index 4486a40..510c921 100644 --- a/.node-version +++ b/.node-version @@ -1 +1 @@ -22.2.0 \ No newline at end of file +22.3.0 \ No newline at end of file From 6c02392c175e241e8b907f27ccd0861f1a34ac17 Mon Sep 17 00:00:00 2001 From: Matt Leon <108271225+wydengyre@users.noreply.github.com> Date: Sat, 15 Jun 2024 08:43:17 -0400 Subject: [PATCH 3/7] ocr-engine style (#47) Preparing for use of biome. --- src/ocr-engine.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ocr-engine.ts b/src/ocr-engine.ts index 3aeffcb..cbc7dda 100644 --- a/src/ocr-engine.ts +++ b/src/ocr-engine.ts @@ -192,7 +192,7 @@ export class OCREngine { // Tesseract const engineImage = new this._tesseractLib.Image( imageData.width, - imageData.height + imageData.height, ); const engineImageBuf = engineImage.data(); engineImageBuf.set(new Uint32Array(imageData.data.buffer)); @@ -262,7 +262,7 @@ export class OCREngine { this._engine.getTextBoxes(textUnit, (progress: number) => { onProgress?.(progress); this._progressChannel?.postMessage({ progress }); - }) + }), ); } @@ -380,7 +380,7 @@ export type CreateOCREngineOptions = { * used to create the tesseract module. Possible options are documented here: * https://github.com/emscripten-core/emscripten/blob/1e7472362a7f5844c5bd23214d725b7a3fd18775/src/settings.js#L876 */ - emscriptenModuleOptions?: {wasmBinary: ArrayBuffer}; + emscriptenModuleOptions?: { wasmBinary: ArrayBuffer }; }; /** From e5bd6f93c8c03172f724b357b572e0d3fbadc205 Mon Sep 17 00:00:00 2001 From: Matt Leon <108271225+wydengyre@users.noreply.github.com> Date: Sat, 15 Jun 2024 09:06:19 -0400 Subject: [PATCH 4/7] better tesseract-core.d.ts (#49) Use new emscript tooling to improve these types. Sadly, we can't make this part of the makefile yet, as it generates some invalid lines that have to be manually commented out. --- src/tesseract-core.d.ts | 107 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 102 insertions(+), 5 deletions(-) diff --git a/src/tesseract-core.d.ts b/src/tesseract-core.d.ts index 24c4b4e..7522d27 100644 --- a/src/tesseract-core.d.ts +++ b/src/tesseract-core.d.ts @@ -1,5 +1,102 @@ -// Type definitions for the Emscripten-generated JS entry point for the -// WASM file. -export default function initTesseractCore( - options: { wasmBinary?: ArrayBuffer } = {}, -): Promise; +// The file below was generated with emcc --emit-tsd +// and edited by hand where noted +// TypeScript bindings for emscripten-generated code. Automatically generated at compile time. +declare namespace RuntimeExports { + let HEAPF32: any; + let HEAPF64: any; + let HEAP_DATA_VIEW: any; + let HEAP8: any; + let HEAPU8: any; + let HEAP16: any; + let HEAPU16: any; + let HEAP32: any; + let HEAPU32: any; + let HEAP64: any; + let HEAPU64: any; +} +interface WasmModule { +} + +type EmbindString = ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string; +export interface Image { + data(): any; + delete(): void; +} + +export interface OCREngine { + getBoundingBoxes(_0: TextUnit): vector; + clearImage(): void; + getOrientation(): Orientation; + getVariable(_0: EmbindString): GetVariableResult; + loadImage(_0: Image): OCRResult; + loadModel(_0: EmbindString): OCRResult; + setVariable(_0: EmbindString, _1: EmbindString): OCRResult; + getHOCR(_0: any): string; + getText(_0: any): string; + getTextBoxes(_0: TextUnit, _1: any): vector; + delete(): void; +} + +export interface TextUnitValue { + value: T; +} +export type TextUnit = TextUnitValue<1>|TextUnitValue<0>; + +export interface vector { + push_back(_0: IntRect): void; + resize(_0: number, _1: IntRect): void; + size(): number; + get(_0: number): IntRect | undefined; + set(_0: number, _1: IntRect): boolean; + delete(): void; +} + +export interface vector { + size(): number; + get(_0: number): TextRect | undefined; + push_back(_0: TextRect): void; + resize(_0: number, _1: TextRect): void; + set(_0: number, _1: TextRect): boolean; + delete(): void; +} + +export type IntRect = { + left: number, + top: number, + right: number, + bottom: number +}; + +export type Orientation = { + rotation: number, + confidence: number +}; + +export type TextRect = { + rect: IntRect, + flags: number, + confidence: number, + text: EmbindString +}; + +export type GetVariableResult = { + success: boolean, + value: EmbindString +}; + +export type OCRResult = { + error: EmbindString +}; + +interface EmbindModule { + Image: {new(_0: number, _1: number): Image}; + OCREngine: {new(): OCREngine}; + TextUnit: {Line: TextUnitValue<1>, Word: TextUnitValue<0>}; + // the following two invalid lines are commented out by hand: + // they seem to be a result of a bug in the emscripten bindings + // vector: {new(): vector}; + // vector: {new(): vector}; +} + +export type MainModule = WasmModule & typeof RuntimeExports & EmbindModule; +export default function MainModuleFactory (options?: unknown): Promise; From 9ad0b8e089dfe2d8090cd84e5c45199676004cbc Mon Sep 17 00:00:00 2001 From: Matt Leon <108271225+wydengyre@users.noreply.github.com> Date: Sat, 15 Jun 2024 09:15:31 -0400 Subject: [PATCH 5/7] makefile: separate out type definition move (#50) --- Makefile | 6 ++++-- src/ocr-engine.ts | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 3e2a81e..9a0242d 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,7 @@ checkformat: node_modules/.bin/prettier --check {src,test}/**/*.js .PHONY: typecheck -typecheck: +typecheck: build/tesseract-core.d.ts node_modules/.bin/tsc .PHONY: test @@ -164,7 +164,6 @@ build/tesseract-core.js build/tesseract-core.wasm: src/lib.cpp src/tesseract-ini $(EMSDK_DIR)/emcc src/lib.cpp $(EMCC_FLAGS) \ -I$(INSTALL_DIR)/include/ -L$(INSTALL_DIR)/lib/ -ltesseract -lleptonica -lembind \ -o build/tesseract-core.js - cp src/tesseract-core.d.ts build/ # Build fallback WASM binary for browsers that don't support WASM SIMD. The JS # output from this build is not used. @@ -173,6 +172,9 @@ build/tesseract-core-fallback.js build/tesseract-core-fallback.wasm: src/lib.cpp -I$(INSTALL_DIR)/include/ -L$(FALLBACK_INSTALL_DIR)/lib/ -L$(INSTALL_DIR)/lib -ltesseract -lleptonica -lembind \ -o build/tesseract-core-fallback.js +build/tesseract-core.d.ts: src/tesseract-core.d.ts build + cp $< $@ + dist/tesseract-core.wasm: build/tesseract-core.wasm mkdir -p dist/ cp $< $@ diff --git a/src/ocr-engine.ts b/src/ocr-engine.ts index cbc7dda..82e136a 100644 --- a/src/ocr-engine.ts +++ b/src/ocr-engine.ts @@ -1,4 +1,3 @@ -// @ts-ignore - Don't error if library hasn't been built yet. import initTesseractCore from "../build/tesseract-core"; import { imageDataFromBitmap } from "./utils"; From d957655c02adea013cfd68c1a834615f8bb54e6f Mon Sep 17 00:00:00 2001 From: Matt Leon <108271225+wydengyre@users.noreply.github.com> Date: Sat, 15 Jun 2024 12:07:04 -0400 Subject: [PATCH 6/7] ocr-engine: tighten types (#51) Use our newfangled emscripten-generated types to eliminate use of any in ocr-engine.ts --- src/ocr-engine.ts | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/ocr-engine.ts b/src/ocr-engine.ts index 82e136a..fa40217 100644 --- a/src/ocr-engine.ts +++ b/src/ocr-engine.ts @@ -1,4 +1,4 @@ -import initTesseractCore from "../build/tesseract-core"; +import initTesseractCore, {MainModule, OCREngine as WASMOCREngine} from "../build/tesseract-core"; import { imageDataFromBitmap } from "./utils"; @@ -88,8 +88,8 @@ export type ProgressListener = (progress: number) => void; * Instances are constructed using {@link createOCREngine}. */ export class OCREngine { - private _tesseractLib: any; - private _engine: any; + private _tesseractLib: MainModule; + private _engine: WASMOCREngine; private _modelLoaded: boolean; private _imageLoaded: boolean; private _progressChannel?: MessagePort; @@ -103,7 +103,7 @@ export class OCREngine { * @param progressChannel - Channel used to report progress * updates when OCREngine is run on a background thread */ - constructor(tessLib: any, progressChannel?: MessagePort) { + constructor(tessLib: MainModule, progressChannel?: MessagePort) { this._tesseractLib = tessLib; this._engine = new tessLib.OCREngine(); this._modelLoaded = false; @@ -115,8 +115,7 @@ export class OCREngine { * Shut down the OCR engine and free up resources. */ destroy() { - this._engine.delete(); - this._engine = null; + this._engine?.delete(); } /** @@ -129,7 +128,7 @@ export class OCREngine { if (!result.success) { throw new Error(`Unable to get variable ${name}`); } - return result.value; + return result.value.toString(); } /** From 251a4e7a6b2272bf87178e36417e757e23a9d466 Mon Sep 17 00:00:00 2001 From: Matt Leon <108271225+wydengyre@users.noreply.github.com> Date: Sat, 15 Jun 2024 12:12:19 -0400 Subject: [PATCH 7/7] emsdk 3.1.61 (#52) --- third_party_versions.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party_versions.mk b/third_party_versions.mk index 2615930..b384878 100644 --- a/third_party_versions.mk +++ b/third_party_versions.mk @@ -1,5 +1,5 @@ -# v3.1.60 -EMSDK_COMMIT=ce74ca2b1c968f897150bdc55daa9e3c12a3fefc +# v3.1.61 +EMSDK_COMMIT=ca7b40ae222a2d8763b6ac845388744b0e57cfb7 # v1.84.1 LEPTONICA_COMMIT=7e803e73511fbd320f01314c141d35d2b8491dde