From 5e11f85db1c81d83bb9a0abf81e33b7e11d6d379 Mon Sep 17 00:00:00 2001
From: lutangar <johan@larriereguichet.fr>
Date: Thu, 2 May 2024 14:57:10 +0200
Subject: [PATCH] chore(transcription): simplify run object, only a uuid is now
 needed, and add more benchmark scenarios

---
 .../tests/src/transcription/benchmark.spec.ts | 58 ++++++++++---------
 .../transcription/transcription-run.spec.ts   | 57 +-----------------
 .../transcription/src/abstract-transcriber.ts |  8 +--
 .../transcription/src/transcription-run.ts    | 25 ++------
 .../transcriber/ctranslate2-transcriber.ts    |  6 +-
 .../whisper/transcriber/openai-transcriber.ts |  6 +-
 .../transcriber/timestamped-transcriber.ts    |  6 +-
 7 files changed, 49 insertions(+), 117 deletions(-)
diff --git a/packages/tests/src/transcription/benchmark.spec.ts b/packages/tests/src/transcription/benchmark.spec.ts
index 9d1f55e76a2..da7630475e6 100644
--- a/packages/tests/src/transcription/benchmark.spec.ts
+++ b/packages/tests/src/transcription/benchmark.spec.ts
@@ -1,5 +1,5 @@
 import { createLogger } from 'winston'
-import short, { UUID } from 'short-uuid'
+import short, { SUUID } from 'short-uuid'
 import { performance, PerformanceObserver } from 'node:perf_hooks'
 // import { CpuInfo, CpuUsage } from 'node:os'
 import { rm, mkdir } from 'node:fs/promises'
@@ -9,16 +9,16 @@ import {
   transcriberFactory,
   TranscriptFile,
   TranscriptFileEvaluator,
-  TranscriptionEngine, TranscriptionRun
+  TranscriptionEngine
 } from '@peertube/peertube-transcription'
 
 interface TestResult {
-  uuid: string
-  WER: number
-  CER: number
-  duration: number
-  engine: TranscriptionEngine
-  model: string
+  uuid: SUUID
+  WER?: number
+  CER?: number
+  duration?: number
+  engine?: TranscriptionEngine
+  model?: string
   // dataThroughput: number // relevant ?
   // cpus: CpuInfo[] // https://nodejs.org/docs/latest-v18.x/api/os.html#oscpus
   // cpuUsages: CpuUsage[] // https://nodejs.org/docs/latest-v18.x/api/process.html#processcpuusagepreviousvalue
@@ -27,12 +27,12 @@ interface TestResult {
   // memoryUsages: Record<number, MemoryUsage> // https://nodejs.org/docs/latest-v18.x/api/process.html#processmemoryusage
 }
 
-type Benchmark = Record<UUID, Partial<TestResult>>
+type Benchmark = Record<SUUID, TestResult>
 
-const benchmarkReducer = (benchmark: Benchmark = {}, uuid: string, testResult: Partial<TestResult>) => ({
+const benchmarkReducer = (benchmark: Benchmark = {}, testResult: TestResult) => ({
   ...benchmark,
-  [uuid]:  {
-    ...benchmark[uuid],
+  [testResult.uuid]:  { // one entry per uuid: fields already stored are kept, testResult fields win
+    ...benchmark[testResult.uuid],
     ...testResult
   }
 })
@@ -42,13 +42,15 @@ interface FormattedTestResult {
   CER?: string
   duration?: string
   model?: string
+  engine?: string
 }
 
-const formatTestResult = ({ WER, CER, duration, model }: Partial<TestResult>): FormattedTestResult => ({
+const formatTestResult = ({ WER, CER, duration, engine, model }: Partial<TestResult>): FormattedTestResult => ({
   WER: WER ? `${WER * 100}%` : undefined,
   CER: CER ? `${CER * 100}%` : undefined,
   duration: duration ? toHumanReadable(duration) : undefined,
-  model
+  model,
+  engine: engine?.name // engine is optional on TestResult: do not throw when it was never recorded
 })
 
 describe('Transcribers benchmark', function () {
@@ -70,7 +72,7 @@ describe('Transcribers benchmark', function () {
     format: 'txt'
   })
 
-  let benchmark: Record<string, Partial<TestResult>> = {}
+  let benchmark: Record<string, TestResult> = {}
 
   before(async function () {
     await mkdir(transcriptDirectory, { recursive: true })
@@ -79,9 +81,8 @@ describe('Transcribers benchmark', function () {
       items
         .getEntries()
         .forEach((entry) => {
-          const { uuid } = TranscriptionRun.extractFromId(entry.name)
-
-          benchmark = benchmarkReducer(benchmark, uuid, {
+          benchmark = benchmarkReducer(benchmark, {
+            uuid: entry.name as SUUID,
             duration: entry.duration
           })
         })
@@ -99,14 +100,15 @@ describe('Transcribers benchmark', function () {
 
       models.forEach((modelName) => {
         it(`Run ${transcriberName} transcriber benchmark with ${modelName} model`, async function () {
-          this.timeout(1000000)
+          this.timeout(15 * 1000 * 60) // 15 minutes
           const model = { name: modelName }
-          const uuid = short.uuid()
+          const uuid = short.generate()
           const transcriptFile = await transcriber.transcribe(mediaFilePath, model, 'fr', 'txt', uuid)
           const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)
           await new Promise(resolve => setTimeout(resolve, 1))
 
-          benchmark = benchmarkReducer(benchmark, uuid, {
+          benchmark = benchmarkReducer(benchmark, {
+            uuid,
             engine: transcriber.engine,
             WER: await evaluator.wer(),
             CER: await evaluator.cer(),
@@ -118,14 +120,16 @@ describe('Transcribers benchmark', function () {
   })
 
   after(async function () {
-    console.table(
-      Object
+    const benchmarksGroupedByModel = Object
         .keys(benchmark)
-        .reduce((formattedBenchmark, engineName, currentIndex, array) => ({
-          ...formattedBenchmark,
-          [engineName]: formatTestResult(benchmark[engineName])
+        .reduce((benchmarksGroupedByModel, uuid, currentIndex, array) => ({
+          ...benchmarksGroupedByModel,
+          [benchmark[uuid].model]: {
+            ...benchmarksGroupedByModel[benchmark[uuid].model],
+            [uuid]: formatTestResult(benchmark[uuid])
+          }
         }), {})
-    )
+    Object.values(benchmarksGroupedByModel).forEach(benchmark => console.table(benchmark))
 
     await rm(transcriptDirectory, { recursive: true, force: true })
 
diff --git a/packages/tests/src/transcription/transcription-run.spec.ts b/packages/tests/src/transcription/transcription-run.spec.ts
index d7385dba72f..d877fbded20 100644
--- a/packages/tests/src/transcription/transcription-run.spec.ts
+++ b/packages/tests/src/transcription/transcription-run.spec.ts
@@ -1,56 +1 @@
-/* eslint-disable @typescript-eslint/no-unused-expressions */
-import { expect } from 'chai'
-import { TranscriptionRun } from '@peertube/peertube-transcription'
-import { UUID } from 'short-uuid'
-
-describe('Transcription run', function () {
-  const supposedlyValidIds = [
-    'a44521d0-0fb8-4ade-8002-3385545c3318_openai-whisper_tiny',
-    'a44521d0-0fb8-4ade-8002-3385545c3318_openai-whisper_openai/tiny',
-    '0f229848-b709-4373-a49c-80dcc0d39e2a_whisper-ctranslate2_tiny'
-  ]
-
-  it(`matches the list of supposedly valid ids`, function () {
-    supposedlyValidIds.forEach((id) => {
-      expect(id.match(TranscriptionRun.RUN_ID_MASK)).to.be.ok
-      expect(TranscriptionRun.extractFromId(id)).to.be.ok
-    })
-  })
-
-  it(`creates a valid run id`, function () {
-    const runId = TranscriptionRun.createId({
-      name: 'engine-name',
-      binary: '/bin/engine-name',
-      requirements: [],
-      type: 'binary',
-      supportedModelFormats: []
-    }, { name: 'openai/tiny' })
-
-    expect(runId.match(TranscriptionRun.RUN_ID_MASK)).to.be.ok
-
-    const found = TranscriptionRun.RUN_ID_MASK.exec(runId)
-    expect(found[2]).to.equals('engine-name')
-    expect(found[3]).to.equals('openai/tiny')
-  })
-
-  it(`extracts information from a run id`, function () {
-    // Because it's a "Branded primitive"
-    // https://github.com/microsoft/TypeScript/wiki/FAQ#can-i-make-a-type-alias-nominal
-    const expectedUuid = 'a44521d0-0fb8-4ade-8002-3385545c3318' as UUID
-    const runId = TranscriptionRun.createId({
-      name: 'engine-name',
-      binary: '/bin/engine-name',
-      requirements: [],
-      type: 'binary',
-      supportedModelFormats: []
-    }, { name: 'openai/tiny' }, expectedUuid)
-
-    expect(runId.match(TranscriptionRun.RUN_ID_MASK)).to.be.ok
-
-    const { uuid, engineName, modelName } = TranscriptionRun.extractFromId(runId)
-    expect(uuid).to.equals(expectedUuid)
-    expect(engineName).to.equals('engine-name')
-    expect(modelName).to.equals('openai/tiny')
-
-  })
-})
+describe('Transcription run', function () {}) // empty suite: the run id helpers it covered no longer exist
diff --git a/packages/transcription/src/abstract-transcriber.ts b/packages/transcription/src/abstract-transcriber.ts
index 3cebc858bb2..cb776f4ed42 100644
--- a/packages/transcription/src/abstract-transcriber.ts
+++ b/packages/transcription/src/abstract-transcriber.ts
@@ -1,5 +1,5 @@
 import { createLogger, Logger } from 'winston'
-import short, { UUID } from 'short-uuid'
+import short, { SUUID } from 'short-uuid'
 import { join } from 'node:path'
 import { existsSync } from 'node:fs'
 import { PerformanceObserver } from 'node:perf_hooks'
@@ -30,8 +30,8 @@ export abstract class AbstractTranscriber {
     this.performanceObserver = performanceObserver
   }
 
-  createRun (model: TranscriptionModel, uuid = short.uuid()) {
-    this.run = new TranscriptionRun(this.engine, model, this.logger, uuid)
+  createRun (uuid: SUUID = short.generate()) {
+    this.run = new TranscriptionRun(this.logger, uuid) // overwrites this.run; the run only receives the logger and its uuid
   }
 
   startRun () {
@@ -60,6 +60,6 @@ export abstract class AbstractTranscriber {
     model: TranscriptionModel,
     language: string,
     format: TranscriptFormat,
-    runId: UUID
+    runId: SUUID
   ): Promise<TranscriptFile>
 }
diff --git a/packages/transcription/src/transcription-run.ts b/packages/transcription/src/transcription-run.ts
index c2b9337786b..6739195cda1 100644
--- a/packages/transcription/src/transcription-run.ts
+++ b/packages/transcription/src/transcription-run.ts
@@ -1,34 +1,17 @@
-import short, { UUID } from 'short-uuid'
+import short, { SUUID } from 'short-uuid'
 import { createLogger, Logger } from 'winston'
-import { TranscriptionModel } from './transcription-model.js'
-import { TranscriptionEngine } from './transcription-engine.js'
 
 export class TranscriptionRun {
-  uuid: UUID
-  engine: TranscriptionEngine
-  model: TranscriptionModel
+  uuid: SUUID
   logger: Logger
 
-  static RUN_ID_MASK = /^([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-/]+)/i
-
-  constructor (engine: TranscriptionEngine, model: TranscriptionModel, logger = createLogger(), uuid?: UUID) {
+  constructor (logger = createLogger(), uuid: SUUID = short.generate()) {
     this.uuid = uuid
-    this.engine = engine
-    this.model = model
     this.logger = logger
   }
 
-  static createId (engine: TranscriptionEngine, model: TranscriptionModel, uuid = short.uuid()) {
-    return `${uuid}_${engine.name}_${model.name}`
-  }
-
-  static extractFromId (runId: string) {
-    const [ , uuid, engineName, modelName ] = TranscriptionRun.RUN_ID_MASK.exec(runId)
-    return { uuid, engineName, modelName }
-  }
-
   get runId () {
-    return TranscriptionRun.createId(this.engine, this.model, this.uuid)
+    return this.uuid
   }
 
   start () {
diff --git a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
index 182e5afcecd..5e64718763c 100644
--- a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
+++ b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
@@ -1,5 +1,5 @@
 import { $ } from 'execa'
-import short, { UUID } from 'short-uuid'
+import short, { SUUID } from 'short-uuid'
 import { join } from 'path'
 import { lstat } from 'node:fs/promises'
 import { OpenaiTranscriber } from './openai-transcriber.js'
@@ -13,7 +13,7 @@ export class Ctranslate2Transcriber extends OpenaiTranscriber {
     model: TranscriptionModel = { name: 'tiny' },
     language: string = 'en',
     format: TranscriptFormat = 'vtt',
-    runId: UUID = short.uuid()
+    runId: SUUID = short.generate()
   ): Promise<TranscriptFile> {
     // Shall we run the command with `{ shell: true }` to get the same error as in sh ?
     // ex: ENOENT => Command not found
@@ -25,7 +25,7 @@ export class Ctranslate2Transcriber extends OpenaiTranscriber {
     }
     const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
 
-    this.createRun(model, runId)
+    this.createRun(runId)
     this.startRun()
     await $$`${this.engine.binary} ${[
       mediaFilePath,
diff --git a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
index 50629bc1c84..d9f8ce13b71 100644
--- a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
+++ b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
@@ -1,6 +1,6 @@
 import { join } from 'path'
 import { $ } from 'execa'
-import short, { UUID } from 'short-uuid'
+import short, { SUUID } from 'short-uuid'
 import { TranscriptionModel } from '../../transcription-model.js'
 import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
 import { AbstractTranscriber } from '../../abstract-transcriber.js'
@@ -12,14 +12,14 @@ export class OpenaiTranscriber extends AbstractTranscriber {
     model: TranscriptionModel = { name: 'tiny' },
     language: string = 'en',
     format: TranscriptFormat = 'vtt',
-    runId: UUID = short.uuid()
+    runId: SUUID = short.generate()
   ): Promise<TranscriptFile> {
     // Shall we run the command with `{ shell: true }` to get the same error as in sh ?
     // ex: ENOENT => Command not found
     const $$ = $({ verbose: true })
     const { baseName } = getFileInfo(mediaFilePath)
 
-    this.createRun(model, runId)
+    this.createRun(runId)
     this.startRun()
     await $$`${this.engine.binary} ${[
       mediaFilePath,
diff --git a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
index bd8d65c790f..e23e3d3eeb1 100644
--- a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
+++ b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
@@ -1,5 +1,5 @@
 import { $ } from 'execa'
-import short, { UUID } from 'short-uuid'
+import short, { SUUID } from 'short-uuid'
 import assert from 'node:assert'
 import { join } from 'node:path'
 import { existsSync } from 'node:fs'
@@ -15,12 +15,12 @@ export class WhisperTimestampedTranscriber extends OpenaiTranscriber {
     model: TranscriptionModel,
     language: string,
     format: TranscriptFormat = 'vtt',
-    runId: UUID = short.uuid()
+    runId: SUUID = short.generate()
   ): Promise<TranscriptFile> {
     const $$ = $({ verbose: true })
     const { baseName, name } = getFileInfo(mediaFilePath)
 
-    this.createRun(model, runId)
+    this.createRun(runId)
     this.startRun()
     await $$`${this.engine.binary} ${[
       mediaFilePath,