diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a54d565..c154996 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,13 +21,9 @@ jobs: run: npm ci - name: Build library run: make lib - - name: post-lib node info - run: which node && node --version && npm --version - name: Typecheck run: make typecheck - name: Check formatting run: make checkformat - - name: Mode node info - run: which node && node --version && npm --version - name: Run tests run: make test diff --git a/.node-version b/.node-version index 4486a40..510c921 100644 --- a/.node-version +++ b/.node-version @@ -1 +1 @@ -22.2.0 \ No newline at end of file +22.3.0 \ No newline at end of file diff --git a/Makefile b/Makefile index 2ec144c..ad40928 100644 --- a/Makefile +++ b/Makefile @@ -34,20 +34,11 @@ checkformat: node_modules/.bin/prettier --check {src,test}/**/*.js .PHONY: typecheck -typecheck: +typecheck: build/tesseract-core.d.ts node_modules/.bin/tsc .PHONY: test test: third_party/tessdata_fast - SHELL=/bin/bash - echo "Current Shell: $$SHELL" - echo "PATH: $$PATH" - echo "Checking user permissions:" - id - which node - type node - alias - node --version node --test test/ocr-engine-test.js .PHONY: release @@ -161,7 +152,6 @@ build/tesseract-core.js build/tesseract-core.wasm: src/lib.cpp src/tesseract-ini emcc src/lib.cpp $(EMCC_FLAGS) \ -I$(INSTALL_DIR)/include/ -L$(INSTALL_DIR)/lib/ -ltesseract -lleptonica -lembind \ -o build/tesseract-core.js - cp src/tesseract-core.d.ts build/ # Build fallback WASM binary for browsers that don't support WASM SIMD. The JS # output from this build is not used. @@ -170,6 +160,9 @@ build/tesseract-core-fallback.js build/tesseract-core-fallback.wasm: src/lib.cpp -I$(INSTALL_DIR)/include/ -L$(FALLBACK_INSTALL_DIR)/lib/ -L$(INSTALL_DIR)/lib -ltesseract -lleptonica -lembind \ -o build/tesseract-core-fallback.js +build/tesseract-core.d.ts: src/tesseract-core.d.ts build + cp $< $@ + dist/tesseract-core.wasm: build/tesseract-core.wasm mkdir -p dist/ cp $< $@ diff --git a/src/ocr-engine.ts b/src/ocr-engine.ts index 3aeffcb..fa40217 100644 --- a/src/ocr-engine.ts +++ b/src/ocr-engine.ts @@ -1,5 +1,4 @@ -// @ts-ignore - Don't error if library hasn't been built yet. -import initTesseractCore from "../build/tesseract-core"; +import initTesseractCore, {MainModule, OCREngine as WASMOCREngine} from "../build/tesseract-core"; import { imageDataFromBitmap } from "./utils"; @@ -89,8 +88,8 @@ export type ProgressListener = (progress: number) => void; * Instances are constructed using {@link createOCREngine}. */ export class OCREngine { - private _tesseractLib: any; - private _engine: any; + private _tesseractLib: MainModule; + private _engine: WASMOCREngine; private _modelLoaded: boolean; private _imageLoaded: boolean; private _progressChannel?: MessagePort; @@ -104,7 +103,7 @@ export class OCREngine { * @param progressChannel - Channel used to report progress * updates when OCREngine is run on a background thread */ - constructor(tessLib: any, progressChannel?: MessagePort) { + constructor(tessLib: MainModule, progressChannel?: MessagePort) { this._tesseractLib = tessLib; this._engine = new tessLib.OCREngine(); this._modelLoaded = false; @@ -116,8 +115,7 @@ export class OCREngine { * Shut down the OCR engine and free up resources. */ destroy() { - this._engine.delete(); - this._engine = null; + this._engine?.delete(); } /** @@ -130,7 +128,7 @@ export class OCREngine { if (!result.success) { throw new Error(`Unable to get variable ${name}`); } - return result.value; + return result.value.toString(); } /** @@ -192,7 +190,7 @@ export class OCREngine { // Tesseract const engineImage = new this._tesseractLib.Image( imageData.width, - imageData.height + imageData.height, ); const engineImageBuf = engineImage.data(); engineImageBuf.set(new Uint32Array(imageData.data.buffer)); @@ -262,7 +260,7 @@ export class OCREngine { this._engine.getTextBoxes(textUnit, (progress: number) => { onProgress?.(progress); this._progressChannel?.postMessage({ progress }); - }) + }), ); } @@ -380,7 +378,7 @@ export type CreateOCREngineOptions = { * used to create the tesseract module. Possible options are documented here: * https://github.com/emscripten-core/emscripten/blob/1e7472362a7f5844c5bd23214d725b7a3fd18775/src/settings.js#L876 */ - emscriptenModuleOptions?: {wasmBinary: ArrayBuffer}; + emscriptenModuleOptions?: { wasmBinary: ArrayBuffer }; }; /** diff --git a/src/tesseract-core.d.ts b/src/tesseract-core.d.ts index 4bc0cd3..7522d27 100644 --- a/src/tesseract-core.d.ts +++ b/src/tesseract-core.d.ts @@ -1,5 +1,102 @@ -// Type definitions for the Emscripten-generated JS entry point for the -// WASM file. -export default function initTesseractCore( - options: { wasmBinary?: ArrayBuffer } = {} -): Promise; +// The file below was generated with emcc --emit-tsd +// and edited by hand where noted +// TypeScript bindings for emscripten-generated code. Automatically generated at compile time. +declare namespace RuntimeExports { + let HEAPF32: any; + let HEAPF64: any; + let HEAP_DATA_VIEW: any; + let HEAP8: any; + let HEAPU8: any; + let HEAP16: any; + let HEAPU16: any; + let HEAP32: any; + let HEAPU32: any; + let HEAP64: any; + let HEAPU64: any; +} +interface WasmModule { +} + +type EmbindString = ArrayBuffer|Uint8Array|Uint8ClampedArray|Int8Array|string; +export interface Image { + data(): any; + delete(): void; +} + +export interface OCREngine { + getBoundingBoxes(_0: TextUnit): vector; + clearImage(): void; + getOrientation(): Orientation; + getVariable(_0: EmbindString): GetVariableResult; + loadImage(_0: Image): OCRResult; + loadModel(_0: EmbindString): OCRResult; + setVariable(_0: EmbindString, _1: EmbindString): OCRResult; + getHOCR(_0: any): string; + getText(_0: any): string; + getTextBoxes(_0: TextUnit, _1: any): vector; + delete(): void; +} + +export interface TextUnitValue { + value: T; +} +export type TextUnit = TextUnitValue<1>|TextUnitValue<0>; + +export interface vector { + push_back(_0: IntRect): void; + resize(_0: number, _1: IntRect): void; + size(): number; + get(_0: number): IntRect | undefined; + set(_0: number, _1: IntRect): boolean; + delete(): void; +} + +export interface vector { + size(): number; + get(_0: number): TextRect | undefined; + push_back(_0: TextRect): void; + resize(_0: number, _1: TextRect): void; + set(_0: number, _1: TextRect): boolean; + delete(): void; +} + +export type IntRect = { + left: number, + top: number, + right: number, + bottom: number +}; + +export type Orientation = { + rotation: number, + confidence: number +}; + +export type TextRect = { + rect: IntRect, + flags: number, + confidence: number, + text: EmbindString +}; + +export type GetVariableResult = { + success: boolean, + value: EmbindString +}; + +export type OCRResult = { + error: EmbindString +}; + +interface EmbindModule { + Image: {new(_0: number, _1: number): Image}; + OCREngine: {new(): OCREngine}; + TextUnit: {Line: TextUnitValue<1>, Word: TextUnitValue<0>}; + // the following two invalid lines are commented out by hand: + // they seem to be a result of a bug in the emscripten bindings + // vector: {new(): vector}; + // vector: {new(): vector}; +} + +export type MainModule = WasmModule & typeof RuntimeExports & EmbindModule; +export default function MainModuleFactory (options?: unknown): Promise;