diff --git a/packages/tests/src/transcription/benchmark.spec.ts b/packages/tests/src/transcription/benchmark.spec.ts index e93414deb79..b9f8a1ab0e4 100644 --- a/packages/tests/src/transcription/benchmark.spec.ts +++ b/packages/tests/src/transcription/benchmark.spec.ts @@ -9,7 +9,8 @@ import { transcriberFactory, TranscriptFile, TranscriptFileEvaluator, - TranscriptionEngine + TranscriptionEngine, + TranscriptionModel } from '@peertube/peertube-transcription' interface TestResult { @@ -101,9 +102,15 @@ describe('Transcribers benchmark', function () { models.forEach((modelName) => { it(`Run ${transcriberName} transcriber benchmark with ${modelName} model`, async function () { this.timeout(15 * 1000 * 60) // 15 minutes - const model = { name: modelName } + const model = new TranscriptionModel(modelName) const uuid = short.generate() - const transcriptFile = await transcriber.transcribe(mediaFilePath, model, 'fr', 'txt', uuid) + const transcriptFile = await transcriber.transcribe({ + mediaFilePath, + model, + language: 'fr', + format: 'txt', + runId: uuid + }) const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile) await new Promise(resolve => setTimeout(resolve, 1)) diff --git a/packages/tests/src/transcription/transcript/transcript-evaluator.spec.ts b/packages/tests/src/transcription/transcript/transcript-evaluator.spec.ts index a5db1e778d3..56a72033043 100644 --- a/packages/tests/src/transcription/transcript/transcript-evaluator.spec.ts +++ b/packages/tests/src/transcription/transcript/transcript-evaluator.spec.ts @@ -7,7 +7,7 @@ import { expect } from 'chai' describe('Transcript File Evaluator', function () { const transcriptDirectory = buildAbsoluteFixturePath('transcription/transcript-evaluator') - const referenceTranscriptFilepath = buildAbsoluteFixturePath('transcription/transcript/reference.txt') + const referenceTranscriptFilePath = buildAbsoluteFixturePath('transcription/transcript/reference.txt') before(async function () { 
await mkdir(transcriptDirectory, { recursive: true }) @@ -29,7 +29,7 @@ describe('Transcript File Evaluator', function () { it(`evaluation must return coherent wer & cer`, async function () { const reference = new TranscriptFile({ - path: referenceTranscriptFilepath, + path: referenceTranscriptFilePath, language: 'fr', format: 'txt' }) diff --git a/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts index e07d2297fed..475ec3744f5 100644 --- a/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts +++ b/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts @@ -4,7 +4,7 @@ import { createLogger } from 'winston' import { join } from 'path' import { mkdir, rm } from 'node:fs/promises' import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils' -import { OpenaiTranscriber, TranscriptFile } from '@peertube/peertube-transcription' +import { OpenaiTranscriber, TranscriptFile, TranscriptionModel, WhisperBuiltinModel } from '@peertube/peertube-transcription' config.truncateThreshold = 0 @@ -12,7 +12,6 @@ describe('Open AI Whisper transcriber', function () { const transcriptDirectory = join(root(), 'test-transcript') const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4') const frVideoPath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.mp4') - const transcriber = new OpenaiTranscriber( { name: 'openai-whisper', @@ -30,7 +29,7 @@ describe('Open AI Whisper transcriber', function () { }) it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () { - const transcript = await transcriber.transcribe(shortVideoPath) + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' }) expect(await transcript.equals(new TranscriptFile({ path: 
join(transcriptDirectory, 'video_short.vtt'), language: 'en', @@ -48,7 +47,7 @@ You }) it('May produce a transcript file in the `srt` format', async function () { - const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt') + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.srt'), language: 'en', @@ -65,7 +64,7 @@ You }) it('May produce a transcript file in the `txt` format', async function () { - const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt') + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.txt'), language: 'en', @@ -77,12 +76,13 @@ You }) it('May transcribe a media file using a local PyTorch model', async function () { - await transcriber.transcribe(frVideoPath, { name: 'myLocalModel', path: buildAbsoluteFixturePath('transcription/models/tiny.pt') }, 'fr') + this.timeout(2 * 1000 * 60) + await transcriber.transcribe({ mediaFilePath: frVideoPath, model: TranscriptionModel.fromPath(buildAbsoluteFixturePath('transcription/models/tiny.pt')), language: 'en' }) }) it('May transcribe a media file in french', async function () { - this.timeout(45000) - const transcript = await transcriber.transcribe(frVideoPath, { name: 'tiny' }, 'fr', 'txt') + this.timeout(2 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'), language: 'fr', @@ -105,8 +105,8 @@ Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux co }) it('May transcribe a media file in 
french with small model', async function () { - this.timeout(400000) - const transcript = await transcriber.transcribe(frVideoPath, { name: 'small' }, 'fr', 'txt') + this.timeout(5 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt', model: new WhisperBuiltinModel('small') }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'), language: 'fr', diff --git a/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts index e8953e017f5..17f29f44da2 100644 --- a/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts +++ b/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts @@ -4,7 +4,15 @@ import { createLogger } from 'winston' import { join } from 'path' import { mkdir, rm } from 'node:fs/promises' import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils' -import { OpenaiTranscriber, WhisperTimestampedTranscriber, TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription' +import { + OpenaiTranscriber, + WhisperTimestampedTranscriber, + TranscriptFile, + TranscriptFileEvaluator, + TranscriptionModel, + WhisperTranscribeArgs, + WhisperBuiltinModel +} from '@peertube/peertube-transcription' config.truncateThreshold = 0 @@ -29,15 +37,10 @@ describe('Linto timestamped Whisper transcriber', function () { }) it('Should transcribe a media file and produce a transcript file in `vtt` with a ms precision', async function () { - const transcript = await transcriber.transcribe( - shortVideoPath, - { name: 'tiny' }, - 'fr' - ) - + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 
'video_short.vtt'), - language: 'fr', + language: 'en', format: 'vtt' }))).to.be.true @@ -52,7 +55,7 @@ you }) it('May produce a transcript file in the `srt` format with a ms precision', async function () { - const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt') + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.srt'), language: 'en', @@ -69,7 +72,7 @@ you }) it('May produce a transcript file in `txt` format', async function () { - const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt') + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.txt'), language: 'en', @@ -81,12 +84,17 @@ you }) it('May transcribe a media file using a local PyTorch model file', async function () { - await transcriber.transcribe(frVideoPath, { name: 'myLocalModel', path: buildAbsoluteFixturePath('transcription/models/tiny.pt') }, 'fr') + await transcriber.transcribe({ mediaFilePath: frVideoPath, model: TranscriptionModel.fromPath(buildAbsoluteFixturePath('transcription/models/tiny.pt')), language: 'en' }) }) it('May transcribe a media file in french', async function () { - this.timeout(45000) - const transcript = await transcriber.transcribe(frVideoPath, { name: 'tiny' }, 'fr', 'txt') + this.timeout(2 * 1000 * 60) + const transcript = await transcriber.transcribe({ + mediaFilePath: frVideoPath, + language: 'fr', + format: 'txt', + model: new WhisperBuiltinModel('tiny') + }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'), language: 'fr', @@ -118,13 +126,13 @@ Ensuite, il pourront lire et commenter ce de leur 
camarade, ou répondre au comm it('Should produce a text transcript similar to openai-whisper implementation', async function () { this.timeout(5 * 1000 * 60) - const transcribeArguments: Parameters = [ - frVideoPath, - { name: 'tiny' }, - 'fr', - 'txt' - ] - const transcript = await transcriber.transcribe(...transcribeArguments) + const transcribeArgs: WhisperTranscribeArgs = { + mediaFilePath: frVideoPath, + model: TranscriptionModel.fromPath(buildAbsoluteFixturePath('transcription/models/tiny.pt')), + language: 'fr', + format: 'txt' + } + const transcript = await transcriber.transcribe(transcribeArgs) const openaiTranscriber = new OpenaiTranscriber( { @@ -137,7 +145,7 @@ Ensuite, il pourront lire et commenter ce de leur camarade, ou répondre au comm createLogger(), join(transcriptDirectory, 'openai-whisper') ) - const openaiTranscript = await openaiTranscriber.transcribe(...transcribeArguments) + const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs) const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript) expect(await transcriptFileEvaluator.wer()).to.be.below(25 / 100) diff --git a/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts b/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts index 31aef5d9ba0..ed19e3840d1 100644 --- a/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts +++ b/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts @@ -4,7 +4,13 @@ import { createLogger } from 'winston' import { join } from 'path' import { mkdir, readFile, rm } from 'node:fs/promises' import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils' -import { Ctranslate2Transcriber, OpenaiTranscriber, TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription' +import { + Ctranslate2Transcriber, + OpenaiTranscriber, + TranscriptFile, + TranscriptFileEvaluator, + 
TranscriptionModel, WhisperTranscribeArgs +} from '@peertube/peertube-transcription' config.truncateThreshold = 0 @@ -29,7 +35,7 @@ describe('Whisper CTranslate2 transcriber', function () { }) it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () { - const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }) + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.vtt') }))).to.be.true expect(await readFile(transcript.path, 'utf8')).to.equal( `WEBVTT @@ -42,7 +48,7 @@ You }) it('May produce a transcript file in the `srt` format', async function () { - const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt') + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.srt'), format: 'srt' @@ -58,7 +64,7 @@ You }) it('May produce a transcript file in the `txt` format', async function () { - const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt') + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.txt'), format: 'txt' @@ -69,12 +75,13 @@ You }) it('May transcribe a media file using a local CTranslate2 model', async function () { - const transcript = await transcriber.transcribe( - shortVideoPath, - { name: 'myLocalModel', path: buildAbsoluteFixturePath('transcription/models/faster-whisper-tiny') }, - 'en', - 'txt' - ) + this.timeout(2 * 1000 * 60) + const transcript = await transcriber.transcribe({ + mediaFilePath: shortVideoPath, + model: 
TranscriptionModel.fromPath(buildAbsoluteFixturePath('transcription/models/faster-whisper-tiny')), + language: 'en', + format: 'txt' + }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.txt'), format: 'txt' @@ -85,8 +92,8 @@ You }) it('May transcribe a media file in french', async function () { - this.timeout(45000) - const transcript = await transcriber.transcribe(frVideoPath, { name: 'tiny' }, 'fr', 'txt') + this.timeout(5 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' }) expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'), language: 'fr', @@ -109,13 +116,13 @@ Ensuite, il pourront lire et commenter ce de leur camarade, on répondra au comm it('Should produce a text transcript similar to openai-whisper implementation', async function () { this.timeout(5 * 1000 * 60) - const transcribeArguments: Parameters = [ - frVideoPath, - { name: 'tiny' }, - 'fr', - 'txt' - ] - const transcript = await transcriber.transcribe(...transcribeArguments) + const transcribeArgs: WhisperTranscribeArgs = { + mediaFilePath: frVideoPath, + model: TranscriptionModel.fromPath(buildAbsoluteFixturePath('transcription/models/tiny.pt')), + language: 'fr', + format: 'txt' + } + const transcript = await transcriber.transcribe(transcribeArgs) const openaiTranscriber = new OpenaiTranscriber( { name: 'openai-whisper', @@ -127,7 +134,7 @@ Ensuite, il pourront lire et commenter ce de leur camarade, on répondra au comm createLogger(), join(transcriptDirectory, 'openai-whisper') ) - const openaiTranscript = await openaiTranscriber.transcribe(...transcribeArguments) + const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs) const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript) expect(await transcriptFileEvaluator.wer()).to.be.below(20 / 100) diff 
--git a/packages/transcription/README.md b/packages/transcription/README.md index 17553afb537..076e313f149 100644 --- a/packages/transcription/README.md +++ b/packages/transcription/README.md @@ -33,11 +33,10 @@ import { OpenaiTranscriber } from '@peertube/peertube-transcription' binary: 'whisper' }); - const transcriptFile = await transcriber.transcribe( - './myVideo.mp4', - { name: 'tiny' }, - 'en', 'txt' - ); + const transcriptFile = await transcriber.transcribe({ + mediaFilePath: './myVideo.mp4', + format: 'txt' + }); console.log(transcriptFile.path); console.log(await transcriptFile.read()); @@ -45,12 +44,15 @@ import { OpenaiTranscriber } from '@peertube/peertube-transcription' ``` Using a local model file: + ```typescript - const transcriptFile = await transcriber.transcribe( - './myVideo.mp4', - { name: 'my fine tuned large model', path: './models/large.pt' }, - 'en', 'txt' - ); +import { TranscriptionModel } from '@peertube/peertube-transcription' + +const transcriptFile = await transcriber.transcribe({ + mediaFilePath: './myVideo.mp4', + model: TranscriptionModel.fromPath('./models/large.pt'), + format: 'txt' +}); ``` You may use the builtin Factory if you're happy with the default configuration: diff --git a/packages/transcription/src/abstract-transcriber.ts b/packages/transcription/src/abstract-transcriber.ts index cb776f4ed42..3556fd9f0a0 100644 --- a/packages/transcription/src/abstract-transcriber.ts +++ b/packages/transcription/src/abstract-transcriber.ts @@ -9,6 +9,14 @@ import { TranscriptionModel } from './transcription-model.js' import { TranscriptionRun } from './transcription-run.js' import { TranscriptFile, TranscriptFormat } from './transcript/index.js' +export interface TranscribeArgs { + mediaFilePath: string + model: TranscriptionModel + language?: string + format?: TranscriptFormat + runId?: SUUID +} + export abstract class AbstractTranscriber { public static DEFAULT_TRANSCRIPT_DIRECTORY = join(root(), 'dist', 'transcripts') @@ 
-55,11 +63,11 @@ export abstract class AbstractTranscriber { return model.format === 'PyTorch' } - abstract transcribe ( - mediaFilePath: string, - model: TranscriptionModel, - language: string, - format: TranscriptFormat, - runId: SUUID - ): Promise + abstract transcribe ({ + mediaFilePath, + model, + language, + format, + runId + }: TranscribeArgs): Promise } diff --git a/packages/transcription/src/transcript/transcript-file-evaluator.ts b/packages/transcription/src/transcript/transcript-file-evaluator.ts index ed5fba648c4..1262cddaf61 100644 --- a/packages/transcription/src/transcript/transcript-file-evaluator.ts +++ b/packages/transcription/src/transcript/transcript-file-evaluator.ts @@ -21,12 +21,12 @@ export class TranscriptFileEvaluator { this.hypothesisTranscriptFile = hypothesisTranscriptFile } - static buildArgs (referenceTranscriptFilepath: string, hypothesisTranscriptFilepath: string, ...args: string[]) { + static buildArgs (referenceTranscriptFilePath: string, hypothesisTranscriptFilePath: string, ...args: string[]) { return [ '--reference', - referenceTranscriptFilepath, + referenceTranscriptFilePath, '--hypothesis', - hypothesisTranscriptFilepath, + hypothesisTranscriptFilePath, ...args ] } } diff --git a/packages/transcription/src/transcript/transcript-file.ts b/packages/transcription/src/transcript/transcript-file.ts index a8b7a6dae6c..fc05593cc4b 100644 --- a/packages/transcription/src/transcript/transcript-file.ts +++ b/packages/transcription/src/transcript/transcript-file.ts @@ -39,6 +39,10 @@ export class TranscriptFile implements TranscriptFileInterface { } async equals (transcript: TranscriptFile, caseSensitive: boolean = true) { + if (this.language !== transcript.language) { + return false + } + const content = await this.read() const transcriptContent = await transcript.read() diff --git a/packages/transcription/src/transcription-model.ts b/packages/transcription/src/transcription-model.ts index 
e9219d133c8..830dd5dbf22 100644 --- a/packages/transcription/src/transcription-model.ts +++ b/packages/transcription/src/transcription-model.ts @@ -1,10 +1,13 @@ +import assert from 'node:assert' +import { existsSync } from 'node:fs' +import { parse } from 'node:path' + export type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark -export abstract class TranscriptionModel { +export class TranscriptionModel { name: string format?: ModelFormat path?: string - url?: string // # - hparams // # - Number of dimensions (int) @@ -16,4 +19,16 @@ export abstract class TranscriptionModel { // # - mel filters // # - tokenizer vocab // # - model variables + + constructor (name: string, path?: string, format?: ModelFormat) { + this.name = name + this.path = path + this.format = format + } + + static fromPath (path: string) { + assert(existsSync(path), `${path} doesn't exist.`) + + return new TranscriptionModel(parse(path).name, path) + } } diff --git a/packages/transcription/src/whisper/index.ts b/packages/transcription/src/whisper/index.ts index ba4581d7f75..ee9cae725fa 100644 --- a/packages/transcription/src/whisper/index.ts +++ b/packages/transcription/src/whisper/index.ts @@ -1,2 +1,3 @@ export * from './transcriber/index.js' export * from './engines.js' +export * from './whisper-builtin-model.js' diff --git a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts index 5e64718763c..b90c2bba785 100644 --- a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts @@ -1,20 +1,20 @@ import { $ } from 'execa' -import short, { SUUID } from 'short-uuid' +import short from 'short-uuid' import { join } from 'path' import { lstat } from 'node:fs/promises' -import { OpenaiTranscriber } from './openai-transcriber.js' -import { 
TranscriptionModel } from '../../transcription-model.js' -import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js' +import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js' +import { TranscriptFile } from '../../transcript/index.js' import { getFileInfo } from '../../file-utils.js' +import { WhisperBuiltinModel } from '../whisper-builtin-model.js' export class Ctranslate2Transcriber extends OpenaiTranscriber { - async transcribe ( - mediaFilePath: string, - model: TranscriptionModel = { name: 'tiny' }, - language: string = 'en', - format: TranscriptFormat = 'vtt', - runId: SUUID = short.generate() - ): Promise { + async transcribe ({ + mediaFilePath, + model = new WhisperBuiltinModel('tiny'), + language, + format = 'vtt', + runId = short.generate() + }: WhisperTranscribeArgs): Promise { // Shall we run the command with `{ shell: true }` to get the same error as in sh ? // ex: ENOENT => Command not found const $$ = $({ verbose: true }) @@ -23,19 +23,20 @@ export class Ctranslate2Transcriber extends OpenaiTranscriber { if (model.path) { await lstat(model.path).then(stats => stats.isDirectory()) } - const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ] + + const modelArg = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ] + const languageArg = language ? 
[ '--language', language ] : [] this.createRun(runId) this.startRun() await $$`${this.engine.binary} ${[ mediaFilePath, - ...modelArgs, + ...modelArg, '--output_format', format, '--output_dir', this.transcriptDirectory, - '--language', - language + ...languageArg ]}` this.stopRun() diff --git a/packages/transcription/src/whisper/transcriber/index.ts b/packages/transcription/src/whisper/transcriber/index.ts index b1d11724259..950c39b0731 100644 --- a/packages/transcription/src/whisper/transcriber/index.ts +++ b/packages/transcription/src/whisper/transcriber/index.ts @@ -1,5 +1,3 @@ export * from './ctranslate2-transcriber.js' -export * from './transformers-js-transcriber.js' -export * from './transformers-transcriber.js' export * from './openai-transcriber.js' export * from './timestamped-transcriber.js' diff --git a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts index d9f8ce13b71..b6be87227da 100644 --- a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts @@ -1,23 +1,27 @@ import { join } from 'path' import { $ } from 'execa' -import short, { SUUID } from 'short-uuid' -import { TranscriptionModel } from '../../transcription-model.js' -import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js' -import { AbstractTranscriber } from '../../abstract-transcriber.js' +import short from 'short-uuid' +import { TranscriptFile } from '../../transcript/index.js' +import { AbstractTranscriber, TranscribeArgs } from '../../abstract-transcriber.js' import { getFileInfo } from '../../file-utils.js' +import { WhisperBuiltinModel } from '../whisper-builtin-model.js' +import { TranscriptionModel } from '../../transcription-model.js' + +export type WhisperTranscribeArgs = Omit & { model?: TranscriptionModel } export class OpenaiTranscriber extends AbstractTranscriber { - async 
transcribe ( - mediaFilePath: string, - model: TranscriptionModel = { name: 'tiny' }, - language: string = 'en', - format: TranscriptFormat = 'vtt', - runId: SUUID = short.generate() - ): Promise { + async transcribe ({ + mediaFilePath, + model = new WhisperBuiltinModel('tiny'), + language, + format = 'vtt', + runId = short.generate() + }: WhisperTranscribeArgs): Promise { // Shall we run the command with `{ shell: true }` to get the same error as in sh ? // ex: ENOENT => Command not found const $$ = $({ verbose: true }) const { baseName } = getFileInfo(mediaFilePath) + const languageArg = language ? [ '--language', language ] : [] this.createRun(runId) this.startRun() @@ -29,8 +33,7 @@ export class OpenaiTranscriber extends AbstractTranscriber { format, '--output_dir', this.transcriptDirectory, - '--language', - language + ...languageArg ]}` this.stopRun() diff --git a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts index e23e3d3eeb1..de2f760127a 100644 --- a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts @@ -1,24 +1,25 @@ import { $ } from 'execa' -import short, { SUUID } from 'short-uuid' +import short from 'short-uuid' import assert from 'node:assert' import { join } from 'node:path' import { existsSync } from 'node:fs' import { rename } from 'node:fs/promises' -import { TranscriptionModel } from '../../transcription-model.js' -import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js' +import { TranscriptFile } from '../../transcript/index.js' import { getFileInfo } from '../../file-utils.js' -import { OpenaiTranscriber } from './openai-transcriber.js' +import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js' +import { WhisperBuiltinModel } from '../whisper-builtin-model.js' export class 
WhisperTimestampedTranscriber extends OpenaiTranscriber { - async transcribe ( - mediaFilePath: string, - model: TranscriptionModel, - language: string, - format: TranscriptFormat = 'vtt', - runId: SUUID = short.generate() - ): Promise { + async transcribe ({ + mediaFilePath, + model = new WhisperBuiltinModel('tiny'), + language, + format = 'vtt', + runId = short.generate() + }: WhisperTranscribeArgs): Promise { const $$ = $({ verbose: true }) const { baseName, name } = getFileInfo(mediaFilePath) + const languageArg = language ? [ '--language', language ] : [] this.createRun(runId) this.startRun() @@ -29,7 +30,8 @@ export class WhisperTimestampedTranscriber extends OpenaiTranscriber { '--output_format', 'all', '--output_dir', - this.transcriptDirectory + this.transcriptDirectory, + ...languageArg ]}` this.stopRun() diff --git a/packages/transcription/src/whisper/transcriber/transformers-js-transcriber.ts b/packages/transcription/src/whisper/transcriber/transformers-js-transcriber.ts deleted file mode 100644 index d06e971a52a..00000000000 --- a/packages/transcription/src/whisper/transcriber/transformers-js-transcriber.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { TranscriptionModel } from '../../transcription-model.js' -import { AbstractTranscriber } from '../../abstract-transcriber.js' -import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js' - -// Disable local models -// env.allowLocalModels = true - -export class TransformersJsTranscriber extends AbstractTranscriber { - async transcribe ( - mediaFilePath: string, - model: TranscriptionModel, - language: string, - format: TranscriptFormat = 'vtt' - ): Promise { - return Promise.resolve(undefined) - // return pipeline('automatic-speech-recognition', 'no_attentions', { - // // For medium models, we need to load the `no_attentions` revision to avoid running out of memory - // revision: [].includes('/whisper-medium') ? 
'no_attentions' : 'main' - // }) - } -} diff --git a/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts b/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts deleted file mode 100644 index 71a53f92318..00000000000 --- a/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { TranscriptionModel } from '../../transcription-model.js' -import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js' -import { AbstractTranscriber } from '../../abstract-transcriber.js' -import { $ } from 'execa' -import { join } from 'path' - -export class TransformersTranscriber extends AbstractTranscriber { - async transcribe ( - mediaFilePath: string, - model: TranscriptionModel, - language: string, - format: TranscriptFormat = 'vtt' - ): Promise { - const $$ = $({ verbose: true }) - // const ffmpegChildProcess = $$`ffmpeg ${[ - // '-i', - // mediaFilePath, - // '-vn', // no video - // '-ar', - // 16000, // set the audio sampling frequency - // '-ac', - // '1', // set the number of audio channels to 1 since Vosk is expecting mono - // '-bufsize', - // 1000, // set a buffer size to provide a steady flow of frames - // '-' - // ]}` - - await $$`transformers-cli ${[ - '--task', - 'automatic-speech-recognition', - '--model', - 'openai/whisper-tiny', - '--input', - mediaFilePath - ]}` - - return new TranscriptFile({ - language, - path: join(this.transcriptDirectory, `test.${format}`), - format - }) - } -} diff --git a/packages/transcription/src/whisper/whisper-builtin-model.ts b/packages/transcription/src/whisper/whisper-builtin-model.ts new file mode 100644 index 00000000000..32981ad2030 --- /dev/null +++ b/packages/transcription/src/whisper/whisper-builtin-model.ts @@ -0,0 +1,11 @@ +import { TranscriptionModel } from '../transcription-model.js' + +export type WhisperBuiltinModelName = 'tiny' | 'base' | 'small' | 'medium' | 'large' | 'large-v2' | 'large-v3' + +export 
class WhisperBuiltinModel extends TranscriptionModel { + + // eslint-disable-next-line @typescript-eslint/no-useless-constructor + constructor (name: WhisperBuiltinModelName) { + super(name) + } +}