From f76ca7287215c214765af75dd95956459eba54a7 Mon Sep 17 00:00:00 2001 From: Heesu Suh Date: Wed, 30 Oct 2024 22:01:39 +0900 Subject: [PATCH] parse youtube transcript (#68) * parse youtube transcript * remove console --- src/utils/promptGenerator.ts | 12 ++ src/utils/youtube-transcript.ts | 198 ++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 src/utils/youtube-transcript.ts diff --git a/src/utils/promptGenerator.ts b/src/utils/promptGenerator.ts index 88e86cd..dbfe1ce 100644 --- a/src/utils/promptGenerator.ts +++ b/src/utils/promptGenerator.ts @@ -21,6 +21,7 @@ import { } from './obsidian' import { RAGEngine } from './ragEngine' import { tokenCount } from './token' +import { YoutubeTranscript, isYoutubeUrl } from './youtube-transcript' export class PromptGenerator { private ragEngine: RAGEngine @@ -373,6 +374,17 @@ When writing out new markdown blocks, remember not to include "line_number|" at * ... */ private async getWebsiteContent(url: string): Promise { + if (isYoutubeUrl(url)) { + // TODO: pass language based on user preferences + const { title, transcript } = + await YoutubeTranscript.fetchTranscriptAndMetadata(url) + + return `Title: ${title} +Transcript: +${transcript.map((t) => `${t.offset}: ${t.text}`).join('\n')} +` + } + const response = await requestUrl({ url }) const turndown = new TurndownService() diff --git a/src/utils/youtube-transcript.ts b/src/utils/youtube-transcript.ts new file mode 100644 index 0000000..7267d64 --- /dev/null +++ b/src/utils/youtube-transcript.ts @@ -0,0 +1,198 @@ +/** + * This source code is licensed under the MIT license. 
/**
 * Original source: https://github.com/Kakulukian/youtube-transcript
 *
 * Modified from the original code
 */

import { requestUrl } from 'obsidian'

/** Matches the supported YouTube URL shapes; group 1 is the 11-character video id. */
const RE_YOUTUBE =
  /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i
const USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)'
/**
 * Matches one `<text>` cue of the transcript XML.
 * Group 1 = `start` attribute, group 2 = `dur` attribute, group 3 = cue text.
 * All three groups are required: the mapping in `fetchTranscriptAndMetadata`
 * reads them positionally.
 */
const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g

/**
 * Tells whether `url` looks like a YouTube video URL.
 * @param url Candidate URL
 * @returns `true` when `url` matches one of the recognised YouTube URL shapes
 */
export function isYoutubeUrl(url: string): boolean {
  return RE_YOUTUBE.test(url)
}

/** Base class for every error thrown by this module; prefixes the message with a module tag. */
export class YoutubeTranscriptError extends Error {
  constructor(message: string) {
    super(`[YoutubeTranscript] 🚨 ${message}`)
  }
}

/** Thrown when the watch page contains a reCAPTCHA challenge instead of caption data. */
export class YoutubeTranscriptTooManyRequestError extends YoutubeTranscriptError {
  constructor() {
    super(
      'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue',
    )
  }
}

/** Thrown when the watch page has neither caption data nor a `playabilityStatus` entry. */
export class YoutubeTranscriptVideoUnavailableError extends YoutubeTranscriptError {
  constructor(videoId: string) {
    super(`The video is no longer available (${videoId})`)
  }
}

/** Thrown when the watch page carries no usable caption metadata. */
export class YoutubeTranscriptDisabledError extends YoutubeTranscriptError {
  constructor(videoId: string) {
    super(`Transcript is disabled on this video (${videoId})`)
  }
}

/** Thrown when caption metadata exists but no transcript can be obtained from it. */
export class YoutubeTranscriptNotAvailableError extends YoutubeTranscriptError {
  constructor(videoId: string) {
    super(`No transcripts are available for this video (${videoId})`)
  }
}

/** Thrown when a specific language was requested and no caption track has that language code. */
export class YoutubeTranscriptNotAvailableLanguageError extends YoutubeTranscriptError {
  constructor(lang: string, availableLangs: string[], videoId: string) {
    super(
      `No transcripts are available in ${lang} this video (${videoId}). Available languages: ${availableLangs.join(
        ', ',
      )}`,
    )
  }
}

export type TranscriptConfig = {
  /** Language code to request; when omitted the first listed caption track is used. */
  lang?: string
}
export type Transcript = {
  text: string
  /** Parsed from the cue's `dur` attribute. */
  duration: number
  /** Parsed from the cue's `start` attribute. */
  offset: number
  lang?: string
}

export type TranscriptAndMetadataResponse = {
  title: string
  transcript: Transcript[]
}

/** The subset of a caption track entry that this module reads. */
type CaptionTrack = {
  baseUrl: string
  languageCode: string
}

/** Narrows an unknown value to a non-null object that can be indexed by string keys. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null
}

/** Checks that a parsed track entry carries the two string fields this module relies on. */
function isCaptionTrack(value: unknown): value is CaptionTrack {
  return (
    isRecord(value) &&
    typeof value.baseUrl === 'string' &&
    typeof value.languageCode === 'string'
  )
}

/** Builds the request headers, adding `Accept-Language` only when a language was requested. */
function buildRequestHeaders(lang?: string): Record<string, string> {
  const headers: Record<string, string> = { 'User-Agent': USER_AGENT }
  if (lang) {
    headers['Accept-Language'] = lang
  }
  return headers
}

/**
 * Extracts the caption track list embedded in the watch page HTML.
 * `videoId` is only used to build error messages.
 */
function extractCaptionTracks(
  videoPageBody: string,
  videoId: string,
): CaptionTrack[] {
  const captionsSection = videoPageBody.split('"captions":')[1]

  if (captionsSection === undefined) {
    // No caption data at all: work out why from other markers on the page.
    if (videoPageBody.includes('class="g-recaptcha"')) {
      throw new YoutubeTranscriptTooManyRequestError()
    }
    if (!videoPageBody.includes('"playabilityStatus":')) {
      throw new YoutubeTranscriptVideoUnavailableError(videoId)
    }
    throw new YoutubeTranscriptDisabledError(videoId)
  }

  let parsed: unknown
  try {
    // The captions object ends where the sibling "videoDetails" key starts.
    // Strip every newline (a string pattern would only remove the first one).
    const rawJson = (captionsSection.split(',"videoDetails')[0] ?? '').replace(
      /\n/g,
      '',
    )
    parsed = JSON.parse(rawJson)
  } catch {
    // Unparseable caption data is reported as "disabled" below.
    parsed = undefined
  }

  const renderer = isRecord(parsed)
    ? parsed.playerCaptionsTracklistRenderer
    : undefined
  if (!renderer) {
    throw new YoutubeTranscriptDisabledError(videoId)
  }
  if (!isRecord(renderer) || !Array.isArray(renderer.captionTracks)) {
    throw new YoutubeTranscriptNotAvailableError(videoId)
  }

  return renderer.captionTracks.filter(isCaptionTrack)
}

/**
 * Class to retrieve transcript if exist
 */
export class YoutubeTranscript {
  /**
   * Fetch the transcript and the page title of a YouTube video.
   *
   * Performs two requests: the watch page (for the title and the caption
   * track list), then the selected track's transcript XML.
   *
   * @param videoId Video url or video identifier
   * @param config Get transcript in a specific language ISO
   * @returns The page title (with the ` - YouTube` suffix removed, or `''`
   *   when no `<title>` is found) and the parsed transcript cues
   * @throws YoutubeTranscriptError when no video id can be derived from `videoId`
   * @throws YoutubeTranscriptTooManyRequestError when the page shows a reCAPTCHA
   * @throws YoutubeTranscriptVideoUnavailableError when the page has no playability data
   * @throws YoutubeTranscriptDisabledError when caption metadata is missing or unparseable
   * @throws YoutubeTranscriptNotAvailableError when there is no caption track,
   *   or the transcript request does not return status 200
   * @throws YoutubeTranscriptNotAvailableLanguageError when `config.lang` matches no track
   */
  public static async fetchTranscriptAndMetadata(
    videoId: string,
    config?: TranscriptConfig,
  ): Promise<TranscriptAndMetadataResponse> {
    const identifier = YoutubeTranscript.retrieveVideoId(videoId)
    const requestedLang = config?.lang

    const videoPageResponse = await requestUrl({
      url: `https://www.youtube.com/watch?v=${identifier}`,
      headers: buildRequestHeaders(requestedLang),
    })
    const videoPageBody = videoPageResponse.text

    // Extract title using regex from <title> tags
    const titleMatch = videoPageBody.match(/<title>(.*?)<\/title>/)
    const title = titleMatch
      ? titleMatch[1].replace(' - YouTube', '').trim()
      : ''

    const tracks = extractCaptionTracks(videoPageBody, videoId)
    const track = requestedLang
      ? tracks.find((candidate) => candidate.languageCode === requestedLang)
      : tracks[0]

    if (!track) {
      if (requestedLang) {
        throw new YoutubeTranscriptNotAvailableLanguageError(
          requestedLang,
          tracks.map((candidate) => candidate.languageCode),
          videoId,
        )
      }
      // Caption metadata was present but listed no usable track.
      throw new YoutubeTranscriptNotAvailableError(videoId)
    }

    const transcriptResponse = await requestUrl({
      url: track.baseUrl,
      headers: buildRequestHeaders(requestedLang),
    })
    if (transcriptResponse.status !== 200) {
      throw new YoutubeTranscriptNotAvailableError(videoId)
    }

    const lang = requestedLang ?? track.languageCode
    const cues = [...transcriptResponse.text.matchAll(RE_XML_TRANSCRIPT)]
    return {
      title,
      transcript: cues.map((cue) => ({
        text: cue[3],
        duration: parseFloat(cue[2]),
        offset: parseFloat(cue[1]),
        lang,
      })),
    }
  }

  /**
   * Retrieve video id from url or string
   * @param videoId video url or video id
   * @returns `videoId` unchanged when it is exactly 11 characters long,
   *   otherwise the id captured from the URL
   * @throws YoutubeTranscriptError when the input is neither
   */
  private static retrieveVideoId(videoId: string) {
    if (videoId.length === 11) {
      return videoId
    }
    const matchId = videoId.match(RE_YOUTUBE)
    if (matchId?.length) {
      return matchId[1]
    }
    throw new YoutubeTranscriptError('Impossible to retrieve Youtube video ID.')
  }
}