From 5e11f85db1c81d83bb9a0abf81e33b7e11d6d379 Mon Sep 17 00:00:00 2001 From: lutangar Date: Thu, 2 May 2024 14:57:10 +0200 Subject: [PATCH] chore(transcription): simplify run object only a uuid is now needed and add more benchmark scenarios --- .../tests/src/transcription/benchmark.spec.ts | 58 ++++++++++--------- .../transcription/transcription-run.spec.ts | 57 +----------------- .../transcription/src/abstract-transcriber.ts | 8 +-- .../transcription/src/transcription-run.ts | 25 ++------ .../transcriber/ctranslate2-transcriber.ts | 6 +- .../whisper/transcriber/openai-transcriber.ts | 6 +- .../transcriber/timestamped-transcriber.ts | 6 +- 7 files changed, 49 insertions(+), 117 deletions(-) diff --git a/packages/tests/src/transcription/benchmark.spec.ts b/packages/tests/src/transcription/benchmark.spec.ts index 9d1f55e76a2..da7630475e6 100644 --- a/packages/tests/src/transcription/benchmark.spec.ts +++ b/packages/tests/src/transcription/benchmark.spec.ts @@ -1,5 +1,5 @@ import { createLogger } from 'winston' -import short, { UUID } from 'short-uuid' +import short, { SUUID } from 'short-uuid' import { performance, PerformanceObserver } from 'node:perf_hooks' // import { CpuInfo, CpuUsage } from 'node:os' import { rm, mkdir } from 'node:fs/promises' @@ -9,16 +9,16 @@ import { transcriberFactory, TranscriptFile, TranscriptFileEvaluator, - TranscriptionEngine, TranscriptionRun + TranscriptionEngine } from '@peertube/peertube-transcription' interface TestResult { - uuid: string - WER: number - CER: number - duration: number - engine: TranscriptionEngine - model: string + uuid: SUUID + WER?: number + CER?: number + duration?: number + engine?: TranscriptionEngine + model?: string // dataThroughput: number // relevant ? 
// cpus: CpuInfo[] // https://nodejs.org/docs/latest-v18.x/api/os.html#oscpus // cpuUsages: CpuUsage[] // https://nodejs.org/docs/latest-v18.x/api/process.html#processcpuusagepreviousvalue @@ -27,12 +27,12 @@ interface TestResult { // memoryUsages: Record // https://nodejs.org/docs/latest-v18.x/api/process.html#processmemoryusage } -type Benchmark = Record> +type Benchmark = Record -const benchmarkReducer = (benchmark: Benchmark = {}, uuid: string, testResult: Partial) => ({ +const benchmarkReducer = (benchmark: Benchmark = {}, testResult: TestResult) => ({ ...benchmark, - [uuid]: { - ...benchmark[uuid], + [testResult.uuid]: { + ...benchmark[testResult.uuid], ...testResult } }) @@ -42,13 +42,15 @@ interface FormattedTestResult { CER?: string duration?: string model?: string + engine?: string } -const formatTestResult = ({ WER, CER, duration, model }: Partial): FormattedTestResult => ({ +const formatTestResult = ({ WER, CER, duration, engine, model }: Partial): FormattedTestResult => ({ WER: WER ? `${WER * 100}%` : undefined, CER: CER ? `${CER * 100}%` : undefined, duration: duration ? 
toHumanReadable(duration) : undefined, - model + model, + engine: engine.name }) describe('Transcribers benchmark', function () { @@ -70,7 +72,7 @@ describe('Transcribers benchmark', function () { format: 'txt' }) - let benchmark: Record> = {} + let benchmark: Record = {} before(async function () { await mkdir(transcriptDirectory, { recursive: true }) @@ -79,9 +81,8 @@ describe('Transcribers benchmark', function () { items .getEntries() .forEach((entry) => { - const { uuid } = TranscriptionRun.extractFromId(entry.name) - - benchmark = benchmarkReducer(benchmark, uuid, { + benchmark = benchmarkReducer(benchmark, { + uuid: entry.name as SUUID, duration: entry.duration }) }) @@ -99,14 +100,15 @@ describe('Transcribers benchmark', function () { models.forEach((modelName) => { it(`Run ${transcriberName} transcriber benchmark with ${modelName} model`, async function () { - this.timeout(1000000) + this.timeout(15 * 1000 * 60) // 15 minutes const model = { name: modelName } - const uuid = short.uuid() + const uuid = short.generate() const transcriptFile = await transcriber.transcribe(mediaFilePath, model, 'fr', 'txt', uuid) const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile) await new Promise(resolve => setTimeout(resolve, 1)) - benchmark = benchmarkReducer(benchmark, uuid, { + benchmark = benchmarkReducer(benchmark, { + uuid, engine: transcriber.engine, WER: await evaluator.wer(), CER: await evaluator.cer(), @@ -118,14 +120,16 @@ describe('Transcribers benchmark', function () { }) after(async function () { - console.table( - Object + const benchmarksGroupedByModel = Object .keys(benchmark) - .reduce((formattedBenchmark, engineName, currentIndex, array) => ({ - ...formattedBenchmark, - [engineName]: formatTestResult(benchmark[engineName]) + .reduce((benchmarksGroupedByModel, uuid, currentIndex, array) => ({ + ...benchmarksGroupedByModel, + [benchmark[uuid].model]: { + ...benchmarksGroupedByModel[benchmark[uuid].model], + [uuid]: 
formatTestResult(benchmark[uuid]) + } }), {}) - ) + Object.values(benchmarksGroupedByModel).forEach(benchmark => console.table(benchmark)) await rm(transcriptDirectory, { recursive: true, force: true }) diff --git a/packages/tests/src/transcription/transcription-run.spec.ts b/packages/tests/src/transcription/transcription-run.spec.ts index d7385dba72f..d877fbded20 100644 --- a/packages/tests/src/transcription/transcription-run.spec.ts +++ b/packages/tests/src/transcription/transcription-run.spec.ts @@ -1,56 +1 @@ -/* eslint-disable @typescript-eslint/no-unused-expressions */ -import { expect } from 'chai' -import { TranscriptionRun } from '@peertube/peertube-transcription' -import { UUID } from 'short-uuid' - -describe('Transcription run', function () { - const supposedlyValidIds = [ - 'a44521d0-0fb8-4ade-8002-3385545c3318_openai-whisper_tiny', - 'a44521d0-0fb8-4ade-8002-3385545c3318_openai-whisper_openai/tiny', - '0f229848-b709-4373-a49c-80dcc0d39e2a_whisper-ctranslate2_tiny' - ] - - it(`matches the list of supposedly valid ids`, function () { - supposedlyValidIds.forEach((id) => { - expect(id.match(TranscriptionRun.RUN_ID_MASK)).to.be.ok - expect(TranscriptionRun.extractFromId(id)).to.be.ok - }) - }) - - it(`creates a valid run id`, function () { - const runId = TranscriptionRun.createId({ - name: 'engine-name', - binary: '/bin/engine-name', - requirements: [], - type: 'binary', - supportedModelFormats: [] - }, { name: 'openai/tiny' }) - - expect(runId.match(TranscriptionRun.RUN_ID_MASK)).to.be.ok - - const found = TranscriptionRun.RUN_ID_MASK.exec(runId) - expect(found[2]).to.equals('engine-name') - expect(found[3]).to.equals('openai/tiny') - }) - - it(`extracts information from a run id`, function () { - // Because it's a "Branded primitive" - // https://github.com/microsoft/TypeScript/wiki/FAQ#can-i-make-a-type-alias-nominal - const expectedUuid = 'a44521d0-0fb8-4ade-8002-3385545c3318' as UUID - const runId = TranscriptionRun.createId({ - name: 'engine-name', 
- binary: '/bin/engine-name', - requirements: [], - type: 'binary', - supportedModelFormats: [] - }, { name: 'openai/tiny' }, expectedUuid) - - expect(runId.match(TranscriptionRun.RUN_ID_MASK)).to.be.ok - - const { uuid, engineName, modelName } = TranscriptionRun.extractFromId(runId) - expect(uuid).to.equals(expectedUuid) - expect(engineName).to.equals('engine-name') - expect(modelName).to.equals('openai/tiny') - - }) -}) +describe('Transcription run', function () {}) diff --git a/packages/transcription/src/abstract-transcriber.ts b/packages/transcription/src/abstract-transcriber.ts index 3cebc858bb2..cb776f4ed42 100644 --- a/packages/transcription/src/abstract-transcriber.ts +++ b/packages/transcription/src/abstract-transcriber.ts @@ -1,5 +1,5 @@ import { createLogger, Logger } from 'winston' -import short, { UUID } from 'short-uuid' +import short, { SUUID } from 'short-uuid' import { join } from 'node:path' import { existsSync } from 'node:fs' import { PerformanceObserver } from 'node:perf_hooks' @@ -30,8 +30,8 @@ export abstract class AbstractTranscriber { this.performanceObserver = performanceObserver } - createRun (model: TranscriptionModel, uuid = short.uuid()) { - this.run = new TranscriptionRun(this.engine, model, this.logger, uuid) + createRun (uuid: SUUID = short.generate()) { + this.run = new TranscriptionRun(this.logger, uuid) } startRun () { @@ -60,6 +60,6 @@ export abstract class AbstractTranscriber { model: TranscriptionModel, language: string, format: TranscriptFormat, - runId: UUID + runId: SUUID ): Promise } diff --git a/packages/transcription/src/transcription-run.ts b/packages/transcription/src/transcription-run.ts index c2b9337786b..6739195cda1 100644 --- a/packages/transcription/src/transcription-run.ts +++ b/packages/transcription/src/transcription-run.ts @@ -1,34 +1,17 @@ -import short, { UUID } from 'short-uuid' +import short, { SUUID } from 'short-uuid' import { createLogger, Logger } from 'winston' -import { TranscriptionModel } from 
'./transcription-model.js' -import { TranscriptionEngine } from './transcription-engine.js' export class TranscriptionRun { - uuid: UUID - engine: TranscriptionEngine - model: TranscriptionModel + uuid: SUUID logger: Logger - static RUN_ID_MASK = /^([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-/]+)/i - - constructor (engine: TranscriptionEngine, model: TranscriptionModel, logger = createLogger(), uuid?: UUID) { + constructor (logger = createLogger(), uuid: SUUID = short.generate()) { this.uuid = uuid - this.engine = engine - this.model = model this.logger = logger } - static createId (engine: TranscriptionEngine, model: TranscriptionModel, uuid = short.uuid()) { - return `${uuid}_${engine.name}_${model.name}` - } - - static extractFromId (runId: string) { - const [ , uuid, engineName, modelName ] = TranscriptionRun.RUN_ID_MASK.exec(runId) - return { uuid, engineName, modelName } - } - get runId () { - return TranscriptionRun.createId(this.engine, this.model, this.uuid) + return this.uuid } start () { diff --git a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts index 182e5afcecd..5e64718763c 100644 --- a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts @@ -1,5 +1,5 @@ import { $ } from 'execa' -import short, { UUID } from 'short-uuid' +import short, { SUUID } from 'short-uuid' import { join } from 'path' import { lstat } from 'node:fs/promises' import { OpenaiTranscriber } from './openai-transcriber.js' @@ -13,7 +13,7 @@ export class Ctranslate2Transcriber extends OpenaiTranscriber { model: TranscriptionModel = { name: 'tiny' }, language: string = 'en', format: TranscriptFormat = 'vtt', - runId: UUID = short.uuid() + runId: SUUID = short.generate() ): Promise { // Shall we run the command with `{ shell: true }` to get the same error as in sh ? 
// ex: ENOENT => Command not found @@ -25,7 +25,7 @@ export class Ctranslate2Transcriber extends OpenaiTranscriber { } const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ] - this.createRun(model, runId) + this.createRun(runId) this.startRun() await $$`${this.engine.binary} ${[ mediaFilePath, diff --git a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts index 50629bc1c84..d9f8ce13b71 100644 --- a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts @@ -1,6 +1,6 @@ import { join } from 'path' import { $ } from 'execa' -import short, { UUID } from 'short-uuid' +import short, { SUUID } from 'short-uuid' import { TranscriptionModel } from '../../transcription-model.js' import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js' import { AbstractTranscriber } from '../../abstract-transcriber.js' @@ -12,14 +12,14 @@ export class OpenaiTranscriber extends AbstractTranscriber { model: TranscriptionModel = { name: 'tiny' }, language: string = 'en', format: TranscriptFormat = 'vtt', - runId: UUID = short.uuid() + runId: SUUID = short.generate() ): Promise { // Shall we run the command with `{ shell: true }` to get the same error as in sh ? 
// ex: ENOENT => Command not found const $$ = $({ verbose: true }) const { baseName } = getFileInfo(mediaFilePath) - this.createRun(model, runId) + this.createRun(runId) this.startRun() await $$`${this.engine.binary} ${[ mediaFilePath, diff --git a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts index bd8d65c790f..e23e3d3eeb1 100644 --- a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts @@ -1,5 +1,5 @@ import { $ } from 'execa' -import short, { UUID } from 'short-uuid' +import short, { SUUID } from 'short-uuid' import assert from 'node:assert' import { join } from 'node:path' import { existsSync } from 'node:fs' @@ -15,12 +15,12 @@ export class WhisperTimestampedTranscriber extends OpenaiTranscriber { model: TranscriptionModel, language: string, format: TranscriptFormat = 'vtt', - runId: UUID = short.uuid() + runId: SUUID = short.generate() ): Promise { const $$ = $({ verbose: true }) const { baseName, name } = getFileInfo(mediaFilePath) - this.createRun(model, runId) + this.createRun(runId) this.startRun() await $$`${this.engine.binary} ${[ mediaFilePath,