Skip to content

Commit

Permalink
parse youtube transcript (#68)
Browse files Browse the repository at this point in the history
* parse youtube transcript

* remove console
  • Loading branch information
glowingjade authored Oct 30, 2024
1 parent c238565 commit f76ca72
Show file tree
Hide file tree
Showing 2 changed files with 210 additions and 0 deletions.
12 changes: 12 additions & 0 deletions src/utils/promptGenerator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import {
} from './obsidian'
import { RAGEngine } from './ragEngine'
import { tokenCount } from './token'
import { YoutubeTranscript, isYoutubeUrl } from './youtube-transcript'

export class PromptGenerator {
private ragEngine: RAGEngine
Expand Down Expand Up @@ -373,6 +374,17 @@ When writing out new markdown blocks, remember not to include "line_number|" at
* ...
*/
private async getWebsiteContent(url: string): Promise<string> {
if (isYoutubeUrl(url)) {
// TODO: pass language based on user preferences
const { title, transcript } =
await YoutubeTranscript.fetchTranscriptAndMetadata(url)

return `Title: ${title}
Transcript:
${transcript.map((t) => `${t.offset}: ${t.text}`).join('\n')}
`
}

const response = await requestUrl({ url })

const turndown = new TurndownService()
Expand Down
198 changes: 198 additions & 0 deletions src/utils/youtube-transcript.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
/**
* This source code is licensed under the MIT license.
* Original source: https://github.com/Kakulukian/youtube-transcript
*
* Modified from the original code
*/

import { requestUrl } from 'obsidian'

const RE_YOUTUBE =
/(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i
const USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)'
const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g

export function isYoutubeUrl(url: string) {
return RE_YOUTUBE.test(url)
}

export class YoutubeTranscriptError extends Error {
constructor(message: string) {
super(`[YoutubeTranscript] 🚨 ${message}`)
}
}

export class YoutubeTranscriptTooManyRequestError extends YoutubeTranscriptError {
constructor() {
super(
'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue',
)
}
}

export class YoutubeTranscriptVideoUnavailableError extends YoutubeTranscriptError {
constructor(videoId: string) {
super(`The video is no longer available (${videoId})`)
}
}

export class YoutubeTranscriptDisabledError extends YoutubeTranscriptError {
constructor(videoId: string) {
super(`Transcript is disabled on this video (${videoId})`)
}
}

export class YoutubeTranscriptNotAvailableError extends YoutubeTranscriptError {
constructor(videoId: string) {
super(`No transcripts are available for this video (${videoId})`)
}
}

export class YoutubeTranscriptNotAvailableLanguageError extends YoutubeTranscriptError {
constructor(lang: string, availableLangs: string[], videoId: string) {
super(
`No transcripts are available in ${lang} this video (${videoId}). Available languages: ${availableLangs.join(
', ',
)}`,
)
}
}

export type TranscriptConfig = {
lang?: string
}
export type Transcript = {
text: string
duration: number
offset: number
lang?: string
}

export type TranscriptAndMetadataResponse = {
title: string
transcript: Transcript[]
}

/**
* Class to retrieve transcript if exist
*/
export class YoutubeTranscript {
/**
* Fetch transcript from YTB Video
* @param videoId Video url or video identifier
* @param config Get transcript in a specific language ISO
*/
public static async fetchTranscriptAndMetadata(
videoId: string,
config?: TranscriptConfig,
): Promise<TranscriptAndMetadataResponse> {
const identifier = this.retrieveVideoId(videoId)
const videoPageResponse = await requestUrl({
url: `https://www.youtube.com/watch?v=${identifier}`,
headers: {
...(config?.lang && { 'Accept-Language': config.lang }),
'User-Agent': USER_AGENT,
},
})
const videoPageBody = videoPageResponse.text

// Extract title using regex from <title> tags
const titleMatch = videoPageBody.match(/<title>(.*?)<\/title>/)
const title = titleMatch
? titleMatch[1].replace(' - YouTube', '').trim()
: ''

const splittedHTML = videoPageBody.split('"captions":')

if (splittedHTML.length <= 1) {
if (videoPageBody.includes('class="g-recaptcha"')) {
throw new YoutubeTranscriptTooManyRequestError()
}
if (!videoPageBody.includes('"playabilityStatus":')) {
throw new YoutubeTranscriptVideoUnavailableError(videoId)
}
throw new YoutubeTranscriptDisabledError(videoId)
}

const captions = (() => {
try {
// eslint-disable-next-line @typescript-eslint/no-unsafe-return
return JSON.parse(
splittedHTML[1].split(',"videoDetails')[0].replace('\n', ''),
)
} catch (e) {
return undefined
}
})()?.playerCaptionsTracklistRenderer

if (!captions) {
throw new YoutubeTranscriptDisabledError(videoId)
}

if (!('captionTracks' in captions)) {
throw new YoutubeTranscriptNotAvailableError(videoId)
}

if (
config?.lang &&
!captions.captionTracks.some(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(track: any) => track.languageCode === config?.lang,
)
) {
throw new YoutubeTranscriptNotAvailableLanguageError(
config?.lang,
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument, @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-return
captions.captionTracks.map((track: any) => track.languageCode),
videoId,
)
}

const transcriptURL: string = (
config?.lang
? captions.captionTracks.find(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(track: any) => track.languageCode === config?.lang,
)
: captions.captionTracks[0]
).baseUrl

const transcriptResponse = await requestUrl({
url: transcriptURL,
headers: {
...(config?.lang && { 'Accept-Language': config.lang }),
'User-Agent': USER_AGENT,
},
})
if (transcriptResponse.status !== 200) {
throw new YoutubeTranscriptNotAvailableError(videoId)
}
const transcriptBody = transcriptResponse.text
const results = [...transcriptBody.matchAll(RE_XML_TRANSCRIPT)]
return {
title,
transcript: results.map((result) => ({
text: result[3],
duration: parseFloat(result[2]),
offset: parseFloat(result[1]),
lang: config?.lang ?? captions.captionTracks[0].languageCode,
})),
}
}

/**
* Retrieve video id from url or string
* @param videoId video url or video id
*/
private static retrieveVideoId(videoId: string) {
if (videoId.length === 11) {
return videoId
}
const matchId = videoId.match(RE_YOUTUBE)
if (matchId?.length) {
return matchId[1]
}
throw new YoutubeTranscriptError('Impossible to retrieve Youtube video ID.')
}
}

0 comments on commit f76ca72

Please sign in to comment.