Skip to content

Commit

Permalink
chore(transcription): simplify run object only a uuid is now needed a…
Browse files Browse the repository at this point in the history
…nd add more benchmark scenarios
  • Loading branch information
lutangar committed May 2, 2024
1 parent c5eb336 commit 5e11f85
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 117 deletions.
58 changes: 31 additions & 27 deletions packages/tests/src/transcription/benchmark.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { createLogger } from 'winston'
import short, { UUID } from 'short-uuid'
import short, { SUUID } from 'short-uuid'
import { performance, PerformanceObserver } from 'node:perf_hooks'
// import { CpuInfo, CpuUsage } from 'node:os'
import { rm, mkdir } from 'node:fs/promises'
Expand All @@ -9,16 +9,16 @@ import {
transcriberFactory,
TranscriptFile,
TranscriptFileEvaluator,
TranscriptionEngine, TranscriptionRun
TranscriptionEngine
} from '@peertube/peertube-transcription'

interface TestResult {
uuid: string
WER: number
CER: number
duration: number
engine: TranscriptionEngine
model: string
uuid: SUUID
WER?: number
CER?: number
duration?: number
engine?: TranscriptionEngine
model?: string
// dataThroughput: number // relevant ?
// cpus: CpuInfo[] // https://nodejs.org/docs/latest-v18.x/api/os.html#oscpus
// cpuUsages: CpuUsage[] // https://nodejs.org/docs/latest-v18.x/api/process.html#processcpuusagepreviousvalue
Expand All @@ -27,12 +27,12 @@ interface TestResult {
// memoryUsages: Record<number, MemoryUsage> // https://nodejs.org/docs/latest-v18.x/api/process.html#processmemoryusage
}

type Benchmark = Record<UUID, Partial<TestResult>>
type Benchmark = Record<SUUID, TestResult>

const benchmarkReducer = (benchmark: Benchmark = {}, uuid: string, testResult: Partial<TestResult>) => ({
const benchmarkReducer = (benchmark: Benchmark = {}, testResult: TestResult) => ({
...benchmark,
[uuid]: {
...benchmark[uuid],
[testResult.uuid]: {
...benchmark[testResult.uuid],
...testResult
}
})
Expand All @@ -42,13 +42,15 @@ interface FormattedTestResult {
CER?: string
duration?: string
model?: string
engine?: string
}

const formatTestResult = ({ WER, CER, duration, model }: Partial<TestResult>): FormattedTestResult => ({
const formatTestResult = ({ WER, CER, duration, engine, model }: Partial<TestResult>): FormattedTestResult => ({
WER: WER ? `${WER * 100}%` : undefined,
CER: CER ? `${CER * 100}%` : undefined,
duration: duration ? toHumanReadable(duration) : undefined,
model
model,
engine: engine.name
})

describe('Transcribers benchmark', function () {
Expand All @@ -70,7 +72,7 @@ describe('Transcribers benchmark', function () {
format: 'txt'
})

let benchmark: Record<string, Partial<TestResult>> = {}
let benchmark: Record<string, TestResult> = {}

before(async function () {
await mkdir(transcriptDirectory, { recursive: true })
Expand All @@ -79,9 +81,8 @@ describe('Transcribers benchmark', function () {
items
.getEntries()
.forEach((entry) => {
const { uuid } = TranscriptionRun.extractFromId(entry.name)

benchmark = benchmarkReducer(benchmark, uuid, {
benchmark = benchmarkReducer(benchmark, {
uuid: entry.name as SUUID,
duration: entry.duration
})
})
Expand All @@ -99,14 +100,15 @@ describe('Transcribers benchmark', function () {

models.forEach((modelName) => {
it(`Run ${transcriberName} transcriber benchmark with ${modelName} model`, async function () {
this.timeout(1000000)
this.timeout(15 * 1000 * 60) // 15 minutes
const model = { name: modelName }
const uuid = short.uuid()
const uuid = short.generate()
const transcriptFile = await transcriber.transcribe(mediaFilePath, model, 'fr', 'txt', uuid)
const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)
await new Promise(resolve => setTimeout(resolve, 1))

benchmark = benchmarkReducer(benchmark, uuid, {
benchmark = benchmarkReducer(benchmark, {
uuid,
engine: transcriber.engine,
WER: await evaluator.wer(),
CER: await evaluator.cer(),
Expand All @@ -118,14 +120,16 @@ describe('Transcribers benchmark', function () {
})

after(async function () {
console.table(
Object
const benchmarksGroupedByModel = Object
.keys(benchmark)
.reduce((formattedBenchmark, engineName, currentIndex, array) => ({
...formattedBenchmark,
[engineName]: formatTestResult(benchmark[engineName])
.reduce((benchmarksGroupedByModel, uuid, currentIndex, array) => ({
...benchmarksGroupedByModel,
[benchmark[uuid].model]: {
...benchmarksGroupedByModel[benchmark[uuid].model],
[uuid]: formatTestResult(benchmark[uuid])
}
}), {})
)
Object.values(benchmarksGroupedByModel).forEach(benchmark => console.table(benchmark))

await rm(transcriptDirectory, { recursive: true, force: true })

Expand Down
57 changes: 1 addition & 56 deletions packages/tests/src/transcription/transcription-run.spec.ts
Original file line number Diff line number Diff line change
@@ -1,56 +1 @@
/* eslint-disable @typescript-eslint/no-unused-expressions */
import { expect } from 'chai'
import { TranscriptionRun } from '@peertube/peertube-transcription'
import { UUID } from 'short-uuid'

describe('Transcription run', function () {
const supposedlyValidIds = [
'a44521d0-0fb8-4ade-8002-3385545c3318_openai-whisper_tiny',
'a44521d0-0fb8-4ade-8002-3385545c3318_openai-whisper_openai/tiny',
'0f229848-b709-4373-a49c-80dcc0d39e2a_whisper-ctranslate2_tiny'
]

it(`matches the list of supposedly valid ids`, function () {
supposedlyValidIds.forEach((id) => {
expect(id.match(TranscriptionRun.RUN_ID_MASK)).to.be.ok
expect(TranscriptionRun.extractFromId(id)).to.be.ok
})
})

it(`creates a valid run id`, function () {
const runId = TranscriptionRun.createId({
name: 'engine-name',
binary: '/bin/engine-name',
requirements: [],
type: 'binary',
supportedModelFormats: []
}, { name: 'openai/tiny' })

expect(runId.match(TranscriptionRun.RUN_ID_MASK)).to.be.ok

const found = TranscriptionRun.RUN_ID_MASK.exec(runId)
expect(found[2]).to.equals('engine-name')
expect(found[3]).to.equals('openai/tiny')
})

it(`extracts information from a run id`, function () {
// Because it's a "Branded primitive"
// https://github.com/microsoft/TypeScript/wiki/FAQ#can-i-make-a-type-alias-nominal
const expectedUuid = 'a44521d0-0fb8-4ade-8002-3385545c3318' as UUID
const runId = TranscriptionRun.createId({
name: 'engine-name',
binary: '/bin/engine-name',
requirements: [],
type: 'binary',
supportedModelFormats: []
}, { name: 'openai/tiny' }, expectedUuid)

expect(runId.match(TranscriptionRun.RUN_ID_MASK)).to.be.ok

const { uuid, engineName, modelName } = TranscriptionRun.extractFromId(runId)
expect(uuid).to.equals(expectedUuid)
expect(engineName).to.equals('engine-name')
expect(modelName).to.equals('openai/tiny')

})
})
describe('Transcription run', function () {})
8 changes: 4 additions & 4 deletions packages/transcription/src/abstract-transcriber.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { createLogger, Logger } from 'winston'
import short, { UUID } from 'short-uuid'
import short, { SUUID } from 'short-uuid'
import { join } from 'node:path'
import { existsSync } from 'node:fs'
import { PerformanceObserver } from 'node:perf_hooks'
Expand Down Expand Up @@ -30,8 +30,8 @@ export abstract class AbstractTranscriber {
this.performanceObserver = performanceObserver
}

createRun (model: TranscriptionModel, uuid = short.uuid()) {
this.run = new TranscriptionRun(this.engine, model, this.logger, uuid)
createRun (uuid: SUUID = short.generate()) {
this.run = new TranscriptionRun(this.logger, uuid)
}

startRun () {
Expand Down Expand Up @@ -60,6 +60,6 @@ export abstract class AbstractTranscriber {
model: TranscriptionModel,
language: string,
format: TranscriptFormat,
runId: UUID
runId: SUUID
): Promise<TranscriptFile>
}
25 changes: 4 additions & 21 deletions packages/transcription/src/transcription-run.ts
Original file line number Diff line number Diff line change
@@ -1,34 +1,17 @@
import short, { UUID } from 'short-uuid'
import short, { SUUID } from 'short-uuid'
import { createLogger, Logger } from 'winston'
import { TranscriptionModel } from './transcription-model.js'
import { TranscriptionEngine } from './transcription-engine.js'

export class TranscriptionRun {
uuid: UUID
engine: TranscriptionEngine
model: TranscriptionModel
uuid: SUUID
logger: Logger

static RUN_ID_MASK = /^([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-/]+)/i

constructor (engine: TranscriptionEngine, model: TranscriptionModel, logger = createLogger(), uuid?: UUID) {
constructor (logger = createLogger(), uuid: SUUID = short.generate()) {
this.uuid = uuid
this.engine = engine
this.model = model
this.logger = logger
}

static createId (engine: TranscriptionEngine, model: TranscriptionModel, uuid = short.uuid()) {
return `${uuid}_${engine.name}_${model.name}`
}

static extractFromId (runId: string) {
const [ , uuid, engineName, modelName ] = TranscriptionRun.RUN_ID_MASK.exec(runId)
return { uuid, engineName, modelName }
}

get runId () {
return TranscriptionRun.createId(this.engine, this.model, this.uuid)
return this.uuid
}

start () {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { $ } from 'execa'
import short, { UUID } from 'short-uuid'
import short, { SUUID } from 'short-uuid'
import { join } from 'path'
import { lstat } from 'node:fs/promises'
import { OpenaiTranscriber } from './openai-transcriber.js'
Expand All @@ -13,7 +13,7 @@ export class Ctranslate2Transcriber extends OpenaiTranscriber {
model: TranscriptionModel = { name: 'tiny' },
language: string = 'en',
format: TranscriptFormat = 'vtt',
runId: UUID = short.uuid()
runId: SUUID = short.generate()
): Promise<TranscriptFile> {
// Shall we run the command with `{ shell: true }` to get the same error as in sh ?
// ex: ENOENT => Command not found
Expand All @@ -25,7 +25,7 @@ export class Ctranslate2Transcriber extends OpenaiTranscriber {
}
const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]

this.createRun(model, runId)
this.createRun(runId)
this.startRun()
await $$`${this.engine.binary} ${[
mediaFilePath,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { join } from 'path'
import { $ } from 'execa'
import short, { UUID } from 'short-uuid'
import short, { SUUID } from 'short-uuid'
import { TranscriptionModel } from '../../transcription-model.js'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { AbstractTranscriber } from '../../abstract-transcriber.js'
Expand All @@ -12,14 +12,14 @@ export class OpenaiTranscriber extends AbstractTranscriber {
model: TranscriptionModel = { name: 'tiny' },
language: string = 'en',
format: TranscriptFormat = 'vtt',
runId: UUID = short.uuid()
runId: SUUID = short.generate()
): Promise<TranscriptFile> {
// Shall we run the command with `{ shell: true }` to get the same error as in sh ?
// ex: ENOENT => Command not found
const $$ = $({ verbose: true })
const { baseName } = getFileInfo(mediaFilePath)

this.createRun(model, runId)
this.createRun(runId)
this.startRun()
await $$`${this.engine.binary} ${[
mediaFilePath,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { $ } from 'execa'
import short, { UUID } from 'short-uuid'
import short, { SUUID } from 'short-uuid'
import assert from 'node:assert'
import { join } from 'node:path'
import { existsSync } from 'node:fs'
Expand All @@ -15,12 +15,12 @@ export class WhisperTimestampedTranscriber extends OpenaiTranscriber {
model: TranscriptionModel,
language: string,
format: TranscriptFormat = 'vtt',
runId: UUID = short.uuid()
runId: SUUID = short.generate()
): Promise<TranscriptFile> {
const $$ = $({ verbose: true })
const { baseName, name } = getFileInfo(mediaFilePath)

this.createRun(model, runId)
this.createRun(runId)
this.startRun()
await $$`${this.engine.binary} ${[
mediaFilePath,
Expand Down

0 comments on commit 5e11f85

Please sign in to comment.