Skip to content

Commit

Permalink
chore(transcription): add proper english video fixture
Browse files Browse the repository at this point in the history
  • Loading branch information
lutangar committed May 15, 2024
1 parent dd3c112 commit 70f68b9
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 42 deletions.
6 changes: 0 additions & 6 deletions packages/tests/fixtures/transcription/README.md

This file was deleted.

16 changes: 16 additions & 0 deletions packages/tests/fixtures/transcription/videos/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
🇫🇷 DRANE Occitanie - Communiquer lors d'une classe transplantée
[./communiquer-lors-dune-classe-transplantee.mp4](communiquer-lors-dune-classe-transplantee.mp4)
> https://podeduc.apps.education.fr/numerique-educatif/video/21893-communiquer-lors-dune-classe-transplantee/
>
> CC BY-NC-SA 4.0 Deed
> Attribution-NonCommercial-ShareAlike 4.0 International

🇫🇷 [Accompagner la victime d'une dérive sectaire ou d'une emprise mentale](https://www.fun-mooc.fr/fr/cours/accompagner-la-victime-de-derive-sectaire/)
> Centre Contre les Manipulations Mentales (CCMM)
> [CC BY-NC-ND 4.0 Deed](https://creativecommons.org/licenses/by-nc-nd/4.0/)
> Attribution-NonCommercial-NoDerivs 4.0 International

🇺🇸 [The Last Man On Earth (1964)](https://archive.org/details/TheLastManOnEarthHD)
> PDM 1.0 Deed
> Public Domain Mark 1.0 Universal
> https://creativecommons.org/publicdomain/mark/1.0/
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
December, 1965.
Is that all it has been since
I inherited the world?
Only three years.
It seems like a hundred million.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* eslint-disable @typescript-eslint/no-unused-expressions */
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'path'
Expand All @@ -16,7 +16,7 @@ config.truncateThreshold = 0

describe('Open AI Whisper transcriber', function () {
const transcriptDirectory = join(root(), 'test-transcript')
const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4')
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
const referenceTranscriptFile = new TranscriptFile({
path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
Expand All @@ -43,16 +43,16 @@ describe('Open AI Whisper transcriber', function () {
it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.vtt'),
path: join(transcriptDirectory, 'the_last_man_on_earth.vtt'),
language: 'en',
format: 'vtt'
}))).to.be.true

expect(await transcript.read()).to.equals(
`WEBVTT
00:00.000 --> 00:02.000
You
00:00.000 --> 00:13.000
December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.
`
)
Expand All @@ -61,15 +61,15 @@ You
it('May produce a transcript file in the `srt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.srt'),
path: join(transcriptDirectory, 'the_last_man_on_earth.srt'),
language: 'en',
format: 'srt'
}))).to.be.true

expect(await transcript.read()).to.equal(
`1
00:00:00,000 --> 00:00:02,000
You
00:00:00,000 --> 00:00:13,000
December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.
`
)
Expand All @@ -78,13 +78,14 @@ You
it('May produce a transcript file in the `txt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
path: join(transcriptDirectory, 'the_last_man_on_earth.txt'),
language: 'en',
format: 'txt'
}))).to.be.true

expect(await transcript.read()).to.equal(`You
`)
expect(await transcript.read()).to.equal(`December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.
`
)
})

it('May transcribe a media file using a local PyTorch model', async function () {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,8 @@ config.truncateThreshold = 0

describe('Linto timestamped Whisper transcriber', function () {
const transcriptDirectory = join(root(), 'test-transcript')
const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4')
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
const referenceTranscriptFile = new TranscriptFile({
path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
language: 'fr',
format: 'txt'
})
const transcriber = new WhisperTimestampedTranscriber(
{
name: 'whisper-timestamped',
Expand All @@ -45,16 +40,22 @@ describe('Linto timestamped Whisper transcriber', function () {
it('Should transcribe a media file and produce a transcript file in `vtt` with a ms precision', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.vtt'),
path: join(transcriptDirectory, 'the_last_man_on_earth.vtt'),
language: 'en',
format: 'vtt'
}))).to.be.true

expect(await transcript.read()).to.equals(
`WEBVTT
00:02.480 --> 00:02.500
you
00:00.460 --> 00:02.080
December 1965.
00:03.700 --> 00:08.800
Is that all it has been since I inherited the world only three years?
00:10.420 --> 00:11.900
Seems like a hundred million.
`
)
Expand All @@ -63,15 +64,23 @@ you
it('May produce a transcript file in the `srt` format with a ms precision', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.srt'),
path: join(transcriptDirectory, 'the_last_man_on_earth.srt'),
language: 'en',
format: 'srt'
}))).to.be.true

expect(await transcript.read()).to.equals(
`1
00:00:02,480 --> 00:00:02,500
you
00:00:00,460 --> 00:00:02,080
December 1965.
2
00:00:03,700 --> 00:00:08,800
Is that all it has been since I inherited the world only three years?
3
00:00:10,420 --> 00:00:11,900
Seems like a hundred million.
`
)
Expand All @@ -80,12 +89,14 @@ you
it('May produce a transcript file in `txt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
path: join(transcriptDirectory, 'the_last_man_on_earth.txt'),
language: 'en',
format: 'txt'
}))).to.be.true

expect(await transcript.read()).to.equals(`you
expect(await transcript.read()).to.equals(`December 1965.
Is that all it has been since I inherited the world only three years?
Seems like a hundred million.
`)
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ config.truncateThreshold = 0

describe('Whisper CTranslate2 transcriber', function () {
const transcriptDirectory = join(root(), 'test-transcript')
const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4')
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
const transcriber = new Ctranslate2Transcriber(
{
Expand All @@ -37,12 +37,12 @@ describe('Whisper CTranslate2 transcriber', function () {

it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })
expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.vtt'), language: 'en' }))).to.be.true
expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'the_last_man_on_earth.vtt'), language: 'en' }))).to.be.true
expect(await readFile(transcript.path, 'utf8')).to.equal(
`WEBVTT
00:00.000 --> 00:02.000
You
00:00.000 --> 00:12.000
December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.
`
)
Expand All @@ -51,15 +51,15 @@ You
it('May produce a transcript file in the `srt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.srt'),
path: join(transcriptDirectory, 'the_last_man_on_earth.srt'),
format: 'srt',
language: 'en'
}))).to.be.true

expect(await readFile(transcript.path, 'utf8')).to.equal(
`1
00:00:00,000 --> 00:00:02,000
You
00:00:00,000 --> 00:00:12,000
December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.
`
)
Expand All @@ -68,12 +68,12 @@ You
it('May produce a transcript file in the `txt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
path: join(transcriptDirectory, 'the_last_man_on_earth.txt'),
format: 'txt',
language: 'en'
}))).to.be.true

expect(await transcript.read()).to.equal(`You
expect(await transcript.read()).to.equal(`December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.
`)
})

Expand All @@ -86,12 +86,12 @@ You
format: 'txt'
})
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
path: join(transcriptDirectory, 'the_last_man_on_earth.txt'),
format: 'txt',
language: 'en'
}))).to.be.true

expect(await transcript.read()).to.equal(`You
expect(await transcript.read()).to.equal(`December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.
`)
})

Expand Down

0 comments on commit 70f68b9

Please sign in to comment.