Skip to content

Commit

Permalink
fix(transcription): activate language detection
Browse files Browse the repository at this point in the history
Forbid transcript creation without a language.
Add `languageDetection` flag to an engine and some assertions.

Fix an issue in `whisper-ctranslate2`:
Softcatala/whisper-ctranslate2#93
  • Loading branch information
lutangar committed May 7, 2024
1 parent 67a921f commit 5fffa52
Show file tree
Hide file tree
Showing 13 changed files with 98 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ describe('Open AI Whisper transcriber', function () {
requirements: [],
type: 'binary',
binary: 'whisper',
supportedModelFormats: [ 'PyTorch' ]
supportedModelFormats: [ 'PyTorch' ],
languageDetection: true
},
createLogger(),
transcriptDirectory
Expand Down Expand Up @@ -76,12 +77,12 @@ You
})

it('May transcribe a media file using a local PyTorch model', async function () {
this.timeout(2 * 1000 * 60)
this.timeout(3 * 1000 * 60)
await transcriber.transcribe({ mediaFilePath: frVideoPath, model: TranscriptionModel.fromPath(buildAbsoluteFixturePath('transcription/models/tiny.pt')), language: 'en' })
})

it('May transcribe a media file in french', async function () {
this.timeout(2 * 1000 * 60)
this.timeout(3 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
Expand All @@ -104,8 +105,14 @@ Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux co
)
})

it('Guesses the video language if not provided', async function () {
this.timeout(3 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
expect(transcript.language).to.equals('fr')
})

it('May transcribe a media file in french with small model', async function () {
this.timeout(5 * 1000 * 60)
this.timeout(6 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt', model: new WhisperBuiltinModel('small') })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ describe('Linto timestamped Whisper transcriber', function () {
requirements: [],
type: 'binary',
binary: 'whisper_timestamped',
supportedModelFormats: [ 'PyTorch' ]
supportedModelFormats: [ 'PyTorch' ],
languageDetection: true
},
createLogger(),
transcriptDirectory
Expand Down Expand Up @@ -84,6 +85,7 @@ you
})

it('May transcribe a media file using a local PyTorch model file', async function () {
this.timeout(2 * 1000 * 60)
await transcriber.transcribe({ mediaFilePath: frVideoPath, model: TranscriptionModel.fromPath(buildAbsoluteFixturePath('transcription/models/tiny.pt')), language: 'en' })
})

Expand Down Expand Up @@ -124,6 +126,12 @@ Ensuite, il pourront lire et commenter ce de leur camarade, ou répondre au comm
)
})

it('Guesses the video language if not provided', async function () {
this.timeout(2 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
expect(transcript.language).to.equals('fr')
})

it('Should produce a text transcript similar to openai-whisper implementation', async function () {
this.timeout(5 * 1000 * 60)
const transcribeArgs: WhisperTranscribeArgs = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ describe('Whisper CTranslate2 transcriber', function () {
requirements: [],
type: 'binary',
binary: 'whisper-ctranslate2',
supportedModelFormats: []
supportedModelFormats: [],
languageDetection: true
},
createLogger(),
transcriptDirectory
Expand All @@ -36,7 +37,7 @@ describe('Whisper CTranslate2 transcriber', function () {

it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })
expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.vtt') }))).to.be.true
expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.vtt'), language: 'en' }))).to.be.true
expect(await readFile(transcript.path, 'utf8')).to.equal(
`WEBVTT
Expand All @@ -51,7 +52,8 @@ You
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.srt'),
format: 'srt'
format: 'srt',
language: 'en'
}))).to.be.true

expect(await readFile(transcript.path, 'utf8')).to.equal(
Expand All @@ -67,7 +69,8 @@ You
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
format: 'txt'
format: 'txt',
language: 'en'
}))).to.be.true

expect(await transcript.read()).to.equal(`You
Expand All @@ -84,7 +87,8 @@ You
})
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
format: 'txt'
format: 'txt',
language: 'en'
}))).to.be.true

expect(await transcript.read()).to.equal(`You
Expand Down Expand Up @@ -114,6 +118,12 @@ Ensuite, il pourront lire et commenter ce de leur camarade, on répondra au comm
)
})

it('Guesses the video language if not provided', async function () {
this.timeout(2 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
expect(transcript.language).to.equals('fr')
})

it('Should produce a text transcript similar to openai-whisper implementation', async function () {
this.timeout(5 * 1000 * 60)
const transcribeArgs: WhisperTranscribeArgs = {
Expand Down
3 changes: 2 additions & 1 deletion packages/transcription/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ import { OpenaiTranscriber } from '@peertube/peertube-transcription'
// create a transcriber powered by OpenAI Whisper CLI
const transcriber = new OpenaiTranscriber({
name: 'openai-whisper',
binary: 'whisper'
binary: 'whisper',
languageDetection: true,
});

const transcriptFile = await transcriber.transcribe({
Expand Down
11 changes: 4 additions & 7 deletions packages/transcription/src/abstract-transcriber.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { createLogger, Logger } from 'winston'
import short, { SUUID } from 'short-uuid'
import { join } from 'node:path'
import { existsSync } from 'node:fs'
import { PerformanceObserver } from 'node:perf_hooks'
import { root } from '@peertube/peertube-node-utils'
import { TranscriptionEngine } from './transcription-engine.js'
Expand Down Expand Up @@ -51,12 +50,10 @@ export abstract class AbstractTranscriber {
delete this.run
}

detectLanguage () {
return Promise.resolve('')
}

loadModel (model: TranscriptionModel) {
if (existsSync(model.path)) { /* empty */ }
/**
 * Ensures a transcription may proceed with the given language argument.
 *
 * If the engine cannot detect the media language on its own
 * (`engine.languageDetection` is unset), the caller must supply one explicitly.
 *
 * @param language optional ISO language code supplied by the caller
 * @throws Error when no language is given and the engine lacks language detection
 */
assertLanguageDetectionAvailable (language?: string) {
  if (!this.engine.languageDetection && !language) {
    // Fixed typo in the user-facing message: "must me" -> "must be"
    throw new Error(`Language detection isn't available in ${this.engine.name}. A language must be provided explicitly.`)
  }
}

supports (model: TranscriptionModel) {
Expand Down
13 changes: 0 additions & 13 deletions packages/transcription/src/file-utils.ts

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
export type TranscriptFormat = 'txt' | 'vtt' | 'srt'
export type TranscriptFormat = 'txt' | 'vtt' | 'srt' | 'json'

export type TranscriptFileInterface = { path: string, language?: string, format: TranscriptFormat }
4 changes: 2 additions & 2 deletions packages/transcription/src/transcript/transcript-file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'

export class TranscriptFile implements TranscriptFileInterface {
path: string
language: string = 'en'
language: string
format: TranscriptFormat = 'vtt'

constructor ({ path, language = 'en', format = 'vtt' }: { path: string, language?: string, format?: TranscriptFormat }) {
constructor ({ path, language, format = 'vtt' }: { path: string, language: string, format?: TranscriptFormat }) {
statSync(path)

this.path = path
Expand Down
1 change: 1 addition & 0 deletions packages/transcription/src/transcription-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export class TranscriptionEngine {
license?: string
forgeURL?: string
supportedModelFormats: ModelFormat[]
languageDetection?: true
// There could be a default models.
// There could be a list of default models

Expand Down
19 changes: 6 additions & 13 deletions packages/transcription/src/whisper/engines.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,6 @@ export const engines: TranscriptionEngine[] = [
license : 'MIT',
supportedModelFormats: [ 'ONNX' ]
},
// {
// name : 'transformers',
// description : 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
// type: 'binary',
// language : 'python',
// requirements : [],
// forgeURL : '',
// license : '',
// supportedModelFormats: [ 'ONNX' ]
// },
{
name: 'openai-whisper',
description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
Expand All @@ -31,7 +21,8 @@ export const engines: TranscriptionEngine[] = [
binary: 'whisper',
forgeURL: 'https://github.com/openai/whisper',
license: 'MIT',
supportedModelFormats: [ 'PyTorch' ]
supportedModelFormats: [ 'PyTorch' ],
languageDetection: true
},
{
name: 'whisper-ctranslate2',
Expand All @@ -42,7 +33,8 @@ export const engines: TranscriptionEngine[] = [
binary: 'whisper-ctranslate2',
forgeURL: 'https://github.com/openai/whisper',
license: 'MIT',
supportedModelFormats: [ 'CTranslate2' ]
supportedModelFormats: [ 'CTranslate2' ],
languageDetection: true
},
{
name: 'whisper-timestamped',
Expand All @@ -53,6 +45,7 @@ export const engines: TranscriptionEngine[] = [
binary: 'whisper_timestamped',
forgeURL: 'https://github.com/openai/whisper',
license: 'MIT',
supportedModelFormats: [ 'CTranslate2' ]
supportedModelFormats: [ 'CTranslate2' ],
languageDetection: true
}
]
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import { $ } from 'execa'
import short from 'short-uuid'
import { join } from 'path'
import { lstat } from 'node:fs/promises'
import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js'
import { TranscriptFile } from '../../transcript/index.js'
import { getFileInfo } from '../../file-utils.js'
import { WhisperBuiltinModel } from '../whisper-builtin-model.js'

export class Ctranslate2Transcriber extends OpenaiTranscriber {
Expand All @@ -15,34 +13,35 @@ export class Ctranslate2Transcriber extends OpenaiTranscriber {
format = 'vtt',
runId = short.generate()
}: WhisperTranscribeArgs): Promise<TranscriptFile> {
this.assertLanguageDetectionAvailable(language)

// Shall we run the command with `{ shell: true }` to get the same error as in sh ?
// ex: ENOENT => Command not found
const $$ = $({ verbose: true })
const { baseName } = getFileInfo(mediaFilePath)

if (model.path) {
await lstat(model.path).then(stats => stats.isDirectory())
}

const modelArg = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
const languageArg = language ? [ '--language', language ] : []
const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
const languageArgs = language ? [ '--language', language ] : []

this.createRun(runId)
this.startRun()
await $$`${this.engine.binary} ${[
mediaFilePath,
...modelArg,
...modelArgs,
'--output_format',
format,
'all',
'--output_dir',
this.transcriptDirectory,
...languageArg
...languageArgs
]}`
this.stopRun()

return new TranscriptFile({
language,
path: join(this.transcriptDirectory, `${baseName}.${format}`),
language: language || await this.getDetectedLanguage(mediaFilePath),
path: this.getTranscriptFilePath(mediaFilePath, format),
format
})
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import { join } from 'path'
import { $ } from 'execa'
import short from 'short-uuid'
import { TranscriptFile } from '../../transcript/index.js'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { AbstractTranscriber, TranscribeArgs } from '../../abstract-transcriber.js'
import { getFileInfo } from '../../file-utils.js'
import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
import { TranscriptionModel } from '../../transcription-model.js'
import { readFile } from 'node:fs/promises'
import { parse } from 'node:path'

export type WhisperTranscribeArgs = Omit<TranscribeArgs, 'model'> & { model?: TranscriptionModel }

Expand All @@ -17,11 +18,12 @@ export class OpenaiTranscriber extends AbstractTranscriber {
format = 'vtt',
runId = short.generate()
}: WhisperTranscribeArgs): Promise<TranscriptFile> {
this.assertLanguageDetectionAvailable(language)

// Shall we run the command with `{ shell: true }` to get the same error as in sh ?
// ex: ENOENT => Command not found
const $$ = $({ verbose: true })
const { baseName } = getFileInfo(mediaFilePath)
const languageArg = language ? [ '--language', language ] : []
const languageArgs = language ? [ '--language', language ] : []

this.createRun(runId)
this.startRun()
Expand All @@ -30,17 +32,31 @@ export class OpenaiTranscriber extends AbstractTranscriber {
'--model',
model?.path || model.name,
'--output_format',
format,
'all',
'--output_dir',
this.transcriptDirectory,
...languageArg
...languageArgs
]}`
this.stopRun()

return new TranscriptFile({
language,
path: join(this.transcriptDirectory, `${baseName}.${format}`),
language: language || await this.getDetectedLanguage(mediaFilePath),
path: this.getTranscriptFilePath(mediaFilePath, format),
format
})
}

/**
 * Reads back the language the engine detected for this media file.
 *
 * Relies on the `json` transcript having been produced alongside the requested
 * format (the transcribe command is run with `--output_format all`), since the
 * JSON output carries a `language` field.
 *
 * @param mediaFilePath media file whose sibling JSON transcript is inspected
 * @returns the detected language code stored in the JSON transcript
 */
async getDetectedLanguage (mediaFilePath: string) {
  const transcriptMetadata = await this.readJsonTranscriptFile(mediaFilePath)

  return transcriptMetadata.language
}

/**
 * Loads and parses the JSON transcript produced for a media file.
 *
 * @param mediaFilePath media file whose `<name>.json` transcript is read
 * @returns the parsed JSON transcript content
 */
async readJsonTranscriptFile (mediaFilePath: string) {
  const rawJson = await readFile(this.getTranscriptFilePath(mediaFilePath, 'json'), 'utf8')

  return JSON.parse(rawJson)
}

/**
 * Builds the path of the transcript file the engine writes for a media file:
 * `<transcriptDirectory>/<media basename without extension>.<format>`.
 *
 * @param mediaFilePath source media file path
 * @param format transcript format used as the file extension
 * @returns absolute/relative path depending on `this.transcriptDirectory`
 */
getTranscriptFilePath (mediaFilePath: string, format: TranscriptFormat) {
  const { name: baseName } = parse(mediaFilePath)

  return join(this.transcriptDirectory, `${baseName}.${format}`)
}
}
Loading

0 comments on commit 5fffa52

Please sign in to comment.