From a087e036be1286a7bbf10b72aaaf6b261b70fa3f Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 3 May 2024 08:39:51 +0200 Subject: [PATCH] Add support for transcription handling (#1119) * Add support for transcription handling * Create honest-vans-tan.md * emit event for each segment * Emit segment array and also emit on publication * export TranscriptionSegment type * Add track sync time updates * make import more explicit * emit events also for local participant * mark transcription events as beta * update protocol --- .changeset/honest-vans-tan.md | 5 +++++ package.json | 2 +- pnpm-lock.yaml | 25 ++++++++++--------------- src/index.ts | 2 +- src/room/RTCEngine.ts | 4 ++++ src/room/Room.ts | 28 +++++++++++++++++++++++++++- src/room/events.ts | 23 +++++++++++++++++++++++ src/room/participant/Participant.ts | 6 +++++- src/room/track/RemoteTrack.ts | 13 +++++++++++++ src/room/track/Track.ts | 9 +++++++++ src/room/track/TrackPublication.ts | 4 +++- src/room/types.ts | 9 +++++++++ src/room/utils.ts | 19 +++++++++++++++++-- 13 files changed, 127 insertions(+), 22 deletions(-) create mode 100644 .changeset/honest-vans-tan.md diff --git a/.changeset/honest-vans-tan.md b/.changeset/honest-vans-tan.md new file mode 100644 index 0000000000..e399e9abe7 --- /dev/null +++ b/.changeset/honest-vans-tan.md @@ -0,0 +1,5 @@ +--- +"livekit-client": patch +--- + +Add support for transcription handling diff --git a/package.json b/package.json index efd3b39628..d22e1374bb 100644 --- a/package.json +++ b/package.json @@ -53,7 +53,7 @@ "size-limit": "size-limit" }, "dependencies": { - "@livekit/protocol": "1.14.0", + "@livekit/protocol": "1.15.0", "events": "^3.3.0", "loglevel": "^1.8.0", "sdp-transform": "^2.14.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 85385511a6..cf0b4128d8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,8 +9,8 @@ importers: .: dependencies: '@livekit/protocol': - specifier: 1.14.0 - version: 1.14.0 + specifier: 1.15.0 + version: 1.15.0 events: 
specifier: ^3.3.0 version: 3.3.0 @@ -788,9 +788,6 @@ packages: resolution: {integrity: sha512-6mQNsaLeXTw0nxYUYu+NSa4Hx4BlF1x1x8/PMFbiR+GBSr+2DkECc69b8hgy2frEodNcvPffeH8YfWd3LI6jhQ==} engines: {node: '>=6.9.0'} - '@bufbuild/protobuf@1.8.0': - resolution: {integrity: sha512-qR9FwI8QKIveDnUYutvfzbC21UZJJryYrLuZGjeZ/VGz+vXelUkK+xgkOHsvPEdYEdxtgUUq4313N8QtOehJ1Q==} - '@bufbuild/protobuf@1.9.0': resolution: {integrity: sha512-W7gp8Q/v1NlCZLsv8pQ3Y0uCu/SHgXOVFK+eUluUKWXmsb6VHkpNx0apdOWWcDbB9sJoKeP8uPrjmehJz6xETQ==} @@ -1060,8 +1057,8 @@ packages: '@livekit/changesets-changelog-github@0.0.4': resolution: {integrity: sha512-MXaiLYwgkYciZb8G2wkVtZ1pJJzZmVx5cM30Q+ClslrIYyAqQhRbPmZDM79/5CGxb1MTemR/tfOM25tgJgAK0g==} - '@livekit/protocol@1.14.0': - resolution: {integrity: sha512-Fd/xVht+ew+rEFu54Rsq8MNqCQV/ZcJFG0WU264p5yL24Ory+AV9hXt/4MUzWXvPXTLIxovJdJJynYKTBJO6qw==} + '@livekit/protocol@1.15.0': + resolution: {integrity: sha512-KVjM1odPzEWkXB4QQyz2qfiecHo74wLeb8YDhaX/cNnxUk0H1L35H0gVgLA8Y6fXkF3ayOHQnXO9V3GzulMyXw==} '@manypkg/find-root@1.1.0': resolution: {integrity: sha512-mki5uBvhHzO8kYYix/WRy2WX8S3B5wdVSc9D6KcU5lQNglP2yt58/VfLuAK49glRXChosY8ap2oJ1qgma3GUVA==} @@ -3384,8 +3381,8 @@ packages: engines: {node: '>=14.17'} hasBin: true - typescript@5.5.0-dev.20240430: - resolution: {integrity: sha512-HDHiMzAPPZucY1VLkXpTF9qrxwqXv0h/SEDnZs59DPOPKlGlAn4c0o+1t45+kOvD2PhfzFpwHymQnmWBlNFAJA==} + typescript@5.5.0-dev.20240502: + resolution: {integrity: sha512-k2os5k45m5icuXipmT6PBBA2byI+V9WhDS5StUQbU1m9NYvivVPurukxM7otJSRfYPMgpLerkvsRM/bLAykaxg==} engines: {node: '>=14.17'} hasBin: true @@ -4456,8 +4453,6 @@ snapshots: '@babel/helper-validator-identifier': 7.24.5 to-fast-properties: 2.0.0 - '@bufbuild/protobuf@1.8.0': {} - '@bufbuild/protobuf@1.9.0': {} '@bufbuild/protoc-gen-es@1.9.0(@bufbuild/protobuf@1.9.0)': @@ -4771,9 +4766,9 @@ snapshots: transitivePeerDependencies: - encoding - '@livekit/protocol@1.14.0': + '@livekit/protocol@1.15.0': dependencies: - '@bufbuild/protobuf': 1.8.0 + 
'@bufbuild/protobuf': 1.9.0 '@manypkg/find-root@1.1.0': dependencies: @@ -5595,7 +5590,7 @@ snapshots: dependencies: semver: 7.6.0 shelljs: 0.8.5 - typescript: 5.5.0-dev.20240430 + typescript: 5.5.0-dev.20240502 electron-to-chromium@1.4.724: {} @@ -7275,7 +7270,7 @@ snapshots: typescript@5.4.5: {} - typescript@5.5.0-dev.20240430: {} + typescript@5.5.0-dev.20240502: {} ufo@1.5.3: {} diff --git a/src/index.ts b/src/index.ts index 5855ac514a..fce93d7779 100644 --- a/src/index.ts +++ b/src/index.ts @@ -44,7 +44,7 @@ export { facingModeFromDeviceLabel, facingModeFromLocalTrack } from './room/trac export * from './room/track/options'; export * from './room/track/processor/types'; export * from './room/track/types'; -export type { DataPublishOptions, SimulationScenario } from './room/types'; +export type { DataPublishOptions, SimulationScenario, TranscriptionSegment } from './room/types'; export * from './version'; export { ConnectionQuality, diff --git a/src/room/RTCEngine.ts b/src/room/RTCEngine.ts index e6d7e95652..334e86e392 100644 --- a/src/room/RTCEngine.ts +++ b/src/room/RTCEngine.ts @@ -23,6 +23,7 @@ import { TrackInfo, type TrackPublishedResponse, TrackUnpublishedResponse, + Transcription, UpdateSubscription, UserPacket, } from '@livekit/protocol'; @@ -634,6 +635,8 @@ export default class RTCEngine extends (EventEmitter as new () => TypedEventEmit this.emit(EngineEvent.ActiveSpeakersUpdate, dp.value.value.speakers); } else if (dp.value?.case === 'user') { this.emit(EngineEvent.DataPacketReceived, dp.value.value, dp.kind); + } else if (dp.value?.case === 'transcription') { + this.emit(EngineEvent.TranscriptionReceived, dp.value.value); } } finally { unlock(); @@ -1357,6 +1360,7 @@ export type EngineEventCallbacks = { ) => void; activeSpeakersUpdate: (speakers: Array) => void; dataPacketReceived: (userPacket: UserPacket, kind: DataPacket_Kind) => void; + transcriptionReceived: (transcription: Transcription) => void; transportsCreated: (publisher: PCTransport, 
subscriber: PCTransport) => void; /** @internal */ trackSenderAdded: (track: Track, sender: RTCRtpSender) => void; diff --git a/src/room/Room.ts b/src/room/Room.ts index 8ac45dd4dd..b5e5d47226 100644 --- a/src/room/Room.ts +++ b/src/room/Room.ts @@ -18,6 +18,8 @@ import { TrackInfo, TrackSource, TrackType, + Transcription as TranscriptionModel, + TranscriptionSegment as TranscriptionSegmentModel, UserPacket, protoInt64, } from '@livekit/protocol'; @@ -61,11 +63,12 @@ import type { TrackPublication } from './track/TrackPublication'; import type { TrackProcessor } from './track/processor/types'; import type { AdaptiveStreamSettings } from './track/types'; import { getNewAudioContext, sourceToKind } from './track/utils'; -import type { SimulationOptions, SimulationScenario } from './types'; +import type { SimulationOptions, SimulationScenario, TranscriptionSegment } from './types'; import { Future, Mutex, createDummyVideoStreamTrack, + extractTranscriptionSegments, getEmptyAudioStreamTrack, isBrowserSupported, isCloud, @@ -330,6 +333,7 @@ class Room extends (EventEmitter as new () => TypedEmitter) }) .on(EngineEvent.ActiveSpeakersUpdate, this.handleActiveSpeakersUpdate) .on(EngineEvent.DataPacketReceived, this.handleDataPacket) + .on(EngineEvent.TranscriptionReceived, this.handleTranscription) .on(EngineEvent.Resuming, () => { this.clearConnectionReconcile(); this.isResuming = true; @@ -1471,6 +1475,23 @@ class Room extends (EventEmitter as new () => TypedEmitter) participant?.emit(ParticipantEvent.DataReceived, userPacket.payload, kind); }; + bufferedSegments: Map = new Map(); + + private handleTranscription = (transcription: TranscriptionModel) => { + // find the participant + const participant = + transcription.participantIdentity === this.localParticipant.identity + ? 
this.localParticipant + : this.remoteParticipants.get(transcription.participantIdentity); + const publication = participant?.trackPublications.get(transcription.trackId); + + const segments = extractTranscriptionSegments(transcription); + + publication?.emit(TrackEvent.TranscriptionReceived, segments); + participant?.emit(ParticipantEvent.TranscriptionReceived, segments, publication); + this.emit(RoomEvent.TranscriptionReceived, segments, participant, publication); + }; + private handleAudioPlaybackStarted = () => { if (this.canPlaybackAudio) { return; @@ -2071,6 +2092,11 @@ export type RoomEventCallbacks = { kind?: DataPacket_Kind, topic?: string, ) => void; + transcriptionReceived: ( + transcription: TranscriptionSegment[], + participant?: Participant, + publication?: TrackPublication, + ) => void; connectionQualityChanged: (quality: ConnectionQuality, participant: Participant) => void; mediaDevicesError: (error: Error) => void; trackStreamStateChanged: ( diff --git a/src/room/events.ts b/src/room/events.ts index 2510e7be5d..0e9a12845e 100644 --- a/src/room/events.ts +++ b/src/room/events.ts @@ -197,6 +197,12 @@ export enum RoomEvent { */ DataReceived = 'dataReceived', + /** + * Transcription received from a participant's track. + * @beta + */ + TranscriptionReceived = 'transcriptionReceived', + /** * Connection quality was changed for a Participant. It'll receive updates * from the local participant, as well as any [[RemoteParticipant]]s that we are @@ -402,6 +408,12 @@ export enum ParticipantEvent { */ DataReceived = 'dataReceived', + /** + * Transcription received from this participant as data source. 
+ * @beta + */ + TranscriptionReceived = 'transcriptionReceived', + /** * Has speaking status changed for the current participant * @@ -479,6 +491,7 @@ export enum EngineEvent { MediaTrackAdded = 'mediaTrackAdded', ActiveSpeakersUpdate = 'activeSpeakersUpdate', DataPacketReceived = 'dataPacketReceived', + TranscriptionReceived = 'transcriptionReceived', RTPVideoMapUpdate = 'rtpVideoMapUpdate', DCBufferStatusChanged = 'dcBufferStatusChanged', ParticipantUpdate = 'participantUpdate', @@ -562,4 +575,14 @@ export enum TrackEvent { * @internal */ AudioTrackFeatureUpdate = 'audioTrackFeatureUpdate', + + /** + * @beta + */ + TranscriptionReceived = 'transcriptionReceived', + + /** + * @experimental + */ + TimeSyncUpdate = 'timeSyncUpdate', } diff --git a/src/room/participant/Participant.ts b/src/room/participant/Participant.ts index 88d21a63b8..530f6b9ff6 100644 --- a/src/room/participant/Participant.ts +++ b/src/room/participant/Participant.ts @@ -16,7 +16,7 @@ import type RemoteTrack from '../track/RemoteTrack'; import type RemoteTrackPublication from '../track/RemoteTrackPublication'; import { Track } from '../track/Track'; import type { TrackPublication } from '../track/TrackPublication'; -import type { LoggerOptions } from '../types'; +import type { LoggerOptions, TranscriptionSegment } from '../types'; export enum ConnectionQuality { Excellent = 'excellent', @@ -329,6 +329,10 @@ export type ParticipantEventCallbacks = { participantMetadataChanged: (prevMetadata: string | undefined, participant?: any) => void; participantNameChanged: (name: string) => void; dataReceived: (payload: Uint8Array, kind: DataPacket_Kind) => void; + transcriptionReceived: ( + transcription: TranscriptionSegment[], + publication?: TrackPublication, + ) => void; isSpeakingChanged: (speaking: boolean) => void; connectionQualityChanged: (connectionQuality: ConnectionQuality) => void; trackStreamStateChanged: ( diff --git a/src/room/track/RemoteTrack.ts b/src/room/track/RemoteTrack.ts index 
b03595a2f6..c5720148f2 100644 --- a/src/room/track/RemoteTrack.ts +++ b/src/room/track/RemoteTrack.ts @@ -77,7 +77,20 @@ export default abstract class RemoteTrack< if (!this.monitorInterval) { this.monitorInterval = setInterval(() => this.monitorReceiver(), monitorFrequency); } + this.registerTimeSyncUpdate(); } protected abstract monitorReceiver(): void; + + registerTimeSyncUpdate() { + const loop = () => { + this.timeSyncHandle = requestAnimationFrame(() => loop()); + const newTime = this.receiver?.getSynchronizationSources()[0]?.rtpTimestamp; + if (newTime && this.rtpTimestamp !== newTime) { + this.emit(TrackEvent.TimeSyncUpdate, newTime); + this.rtpTimestamp = newTime; + } + }; + loop(); + } } diff --git a/src/room/track/Track.ts b/src/room/track/Track.ts index 535be870a7..62ff9782b5 100644 --- a/src/room/track/Track.ts +++ b/src/room/track/Track.ts @@ -53,6 +53,9 @@ export abstract class Track< */ streamState: Track.StreamState = Track.StreamState.Active; + /** @internal */ + rtpTimestamp: number | undefined; + protected _mediaStreamTrack: MediaStreamTrack; protected _mediaStreamID: string; @@ -63,6 +66,8 @@ export abstract class Track< private loggerContextCb: LoggerOptions['loggerContextCb']; + protected timeSyncHandle: number | undefined; + protected _currentBitrate: number = 0; protected monitorInterval?: ReturnType; @@ -255,6 +260,9 @@ export abstract class Track< if (this.monitorInterval) { clearInterval(this.monitorInterval); } + if (this.timeSyncHandle) { + cancelAnimationFrame(this.timeSyncHandle); + } } /** @internal */ @@ -517,4 +525,5 @@ export type TrackEventCallbacks = { upstreamResumed: (track: any) => void; trackProcessorUpdate: (processor?: TrackProcessor) => void; audioTrackFeatureUpdate: (track: any, feature: AudioTrackFeature, enabled: boolean) => void; + timeSyncUpdate: (timestamp: number) => void; }; diff --git a/src/room/track/TrackPublication.ts b/src/room/track/TrackPublication.ts index ba26218737..ca48f3310e 100644 --- 
a/src/room/track/TrackPublication.ts +++ b/src/room/track/TrackPublication.ts @@ -9,7 +9,7 @@ import { EventEmitter } from 'events'; import type TypedEventEmitter from 'typed-emitter'; import log, { LoggerNames, getLogger } from '../../logger'; import { TrackEvent } from '../events'; -import type { LoggerOptions } from '../types'; +import type { LoggerOptions, TranscriptionSegment } from '../types'; import LocalAudioTrack from './LocalAudioTrack'; import LocalVideoTrack from './LocalVideoTrack'; import RemoteAudioTrack from './RemoteAudioTrack'; @@ -174,4 +174,6 @@ export type PublicationEventCallbacks = { prevStatus: TrackPublication.SubscriptionStatus, ) => void; subscriptionFailed: (error: SubscriptionError) => void; + transcriptionReceived: (transcription: TranscriptionSegment[]) => void; + timeSyncUpdate: (timestamp: number) => void; }; diff --git a/src/room/types.ts b/src/room/types.ts index 04c3c8d907..32c083cd60 100644 --- a/src/room/types.ts +++ b/src/room/types.ts @@ -55,3 +55,12 @@ export type LoggerOptions = { loggerName?: string; loggerContextCb?: () => Record; }; + +export interface TranscriptionSegment { + id: string; + text: string; + language: string; + startTime: number; + endTime: number; + final: boolean; +} diff --git a/src/room/utils.ts b/src/room/utils.ts index a5f6898e5f..e2494d9245 100644 --- a/src/room/utils.ts +++ b/src/room/utils.ts @@ -1,4 +1,4 @@ -import { ClientInfo, ClientInfo_SDK } from '@livekit/protocol'; +import { ClientInfo, ClientInfo_SDK, Transcription as TranscriptionModel } from '@livekit/protocol'; import { getBrowser } from '../utils/browserParser'; import { protocolVersion, version } from '../version'; import CriticalTimers from './timers'; @@ -6,7 +6,7 @@ import type LocalAudioTrack from './track/LocalAudioTrack'; import type RemoteAudioTrack from './track/RemoteAudioTrack'; import { VideoCodec, videoCodecs } from './track/options'; import { getNewAudioContext } from './track/utils'; -import type { LiveKitReactNativeInfo 
} from './types'; +import type { LiveKitReactNativeInfo, TranscriptionSegment } from './types'; const separator = '|'; export const ddExtensionURI = @@ -527,3 +527,18 @@ export function toHttpUrl(url: string): string { } return url; } + +export function extractTranscriptionSegments( + transcription: TranscriptionModel, +): TranscriptionSegment[] { + return transcription.segments.map(({ id, text, language, startTime, endTime, final }) => { + return { + id, + text, + startTime: Number.parseInt(startTime.toString()), + endTime: Number.parseInt(endTime.toString()), + final, + language, + }; + }); +}