parse youtube transcript (#68)

* parse youtube transcript * remove console
glowingjade · Oct 30, 2024 · f76ca72 · f76ca72
1 parent c238565
commit f76ca72
Show file tree

Hide file tree

Showing 2 changed files with 210 additions and 0 deletions.
diff --git a/src/utils/promptGenerator.ts b/src/utils/promptGenerator.ts
@@ -21,6 +21,7 @@ import {
 } from './obsidian'
 import { RAGEngine } from './ragEngine'
 import { tokenCount } from './token'
+import { YoutubeTranscript, isYoutubeUrl } from './youtube-transcript'
 
 export class PromptGenerator {
   private ragEngine: RAGEngine
@@ -373,6 +374,17 @@ When writing out new markdown blocks, remember not to include "line_number|" at
    * ...
    */
   private async getWebsiteContent(url: string): Promise<string> {
+    if (isYoutubeUrl(url)) {
+      // TODO: pass language based on user preferences
+      const { title, transcript } =
+        await YoutubeTranscript.fetchTranscriptAndMetadata(url)
+
+      return `Title: ${title}
+Transcript:
+${transcript.map((t) => `${t.offset}: ${t.text}`).join('\n')}
+`
+    }
+
     const response = await requestUrl({ url })
 
     const turndown = new TurndownService()

diff --git a/src/utils/youtube-transcript.ts b/src/utils/youtube-transcript.ts
@@ -0,0 +1,198 @@
+/**
+ * This source code is licensed under the MIT license.
+ * Original source: https://github.com/Kakulukian/youtube-transcript
+ *
+ * Modified from the original code
+ */
+
+import { requestUrl } from 'obsidian'
+
+const RE_YOUTUBE =
+  /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i
+const USER_AGENT =
+  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)'
+const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g
+
+export function isYoutubeUrl(url: string) {
+  return RE_YOUTUBE.test(url)
+}
+
+export class YoutubeTranscriptError extends Error {
+  constructor(message: string) {
+    super(`[YoutubeTranscript] 🚨 ${message}`)
+  }
+}
+
+export class YoutubeTranscriptTooManyRequestError extends YoutubeTranscriptError {
+  constructor() {
+    super(
+      'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue',
+    )
+  }
+}
+
+export class YoutubeTranscriptVideoUnavailableError extends YoutubeTranscriptError {
+  constructor(videoId: string) {
+    super(`The video is no longer available (${videoId})`)
+  }
+}
+
+export class YoutubeTranscriptDisabledError extends YoutubeTranscriptError {
+  constructor(videoId: string) {
+    super(`Transcript is disabled on this video (${videoId})`)
+  }
+}
+
+export class YoutubeTranscriptNotAvailableError extends YoutubeTranscriptError {
+  constructor(videoId: string) {
+    super(`No transcripts are available for this video (${videoId})`)
+  }
+}
+
+export class YoutubeTranscriptNotAvailableLanguageError extends YoutubeTranscriptError {
+  constructor(lang: string, availableLangs: string[], videoId: string) {
+    super(
+      `No transcripts are available in ${lang} this video (${videoId}). Available languages: ${availableLangs.join(
+        ', ',
+      )}`,
+    )
+  }
+}
+
+export type TranscriptConfig = {
+  lang?: string
+}
+export type Transcript = {
+  text: string
+  duration: number
+  offset: number
+  lang?: string
+}
+
+export type TranscriptAndMetadataResponse = {
+  title: string
+  transcript: Transcript[]
+}
+
+/**
+ * Class to retrieve transcript if exist
+ */
+export class YoutubeTranscript {
+  /**
+   * Fetch transcript from YTB Video
+   * @param videoId Video url or video identifier
+   * @param config Get transcript in a specific language ISO
+   */
+  public static async fetchTranscriptAndMetadata(
+    videoId: string,
+    config?: TranscriptConfig,
+  ): Promise<TranscriptAndMetadataResponse> {
+    const identifier = this.retrieveVideoId(videoId)
+    const videoPageResponse = await requestUrl({
+      url: `https://www.youtube.com/watch?v=${identifier}`,
+      headers: {
+        ...(config?.lang && { 'Accept-Language': config.lang }),
+        'User-Agent': USER_AGENT,
+      },
+    })
+    const videoPageBody = videoPageResponse.text
+
+    // Extract title using regex from <title> tags
+    const titleMatch = videoPageBody.match(/<title>(.*?)<\/title>/)
+    const title = titleMatch
+      ? titleMatch[1].replace(' - YouTube', '').trim()
+      : ''
+
+    const splittedHTML = videoPageBody.split('"captions":')
+
+    if (splittedHTML.length <= 1) {
+      if (videoPageBody.includes('class="g-recaptcha"')) {
+        throw new YoutubeTranscriptTooManyRequestError()
+      }
+      if (!videoPageBody.includes('"playabilityStatus":')) {
+        throw new YoutubeTranscriptVideoUnavailableError(videoId)
+      }
+      throw new YoutubeTranscriptDisabledError(videoId)
+    }
+
+    const captions = (() => {
+      try {
+        // eslint-disable-next-line @typescript-eslint/no-unsafe-return
+        return JSON.parse(
+          splittedHTML[1].split(',"videoDetails')[0].replace('\n', ''),
+        )
+      } catch (e) {
+        return undefined
+      }
+    })()?.playerCaptionsTracklistRenderer
+
+    if (!captions) {
+      throw new YoutubeTranscriptDisabledError(videoId)
+    }
+
+    if (!('captionTracks' in captions)) {
+      throw new YoutubeTranscriptNotAvailableError(videoId)
+    }
+
+    if (
+      config?.lang &&
+      !captions.captionTracks.some(
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        (track: any) => track.languageCode === config?.lang,
+      )
+    ) {
+      throw new YoutubeTranscriptNotAvailableLanguageError(
+        config?.lang,
+        // eslint-disable-next-line @typescript-eslint/no-unsafe-argument, @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-return
+        captions.captionTracks.map((track: any) => track.languageCode),
+        videoId,
+      )
+    }
+
+    const transcriptURL: string = (
+      config?.lang
+        ? captions.captionTracks.find(
+            // eslint-disable-next-line @typescript-eslint/no-explicit-any
+            (track: any) => track.languageCode === config?.lang,
+          )
+        : captions.captionTracks[0]
+    ).baseUrl
+
+    const transcriptResponse = await requestUrl({
+      url: transcriptURL,
+      headers: {
+        ...(config?.lang && { 'Accept-Language': config.lang }),
+        'User-Agent': USER_AGENT,
+      },
+    })
+    if (transcriptResponse.status !== 200) {
+      throw new YoutubeTranscriptNotAvailableError(videoId)
+    }
+    const transcriptBody = transcriptResponse.text
+    const results = [...transcriptBody.matchAll(RE_XML_TRANSCRIPT)]
+    return {
+      title,
+      transcript: results.map((result) => ({
+        text: result[3],
+        duration: parseFloat(result[2]),
+        offset: parseFloat(result[1]),
+        lang: config?.lang ?? captions.captionTracks[0].languageCode,
+      })),
+    }
+  }
+
+  /**
+   * Retrieve video id from url or string
+   * @param videoId video url or video id
+   */
+  private static retrieveVideoId(videoId: string) {
+    if (videoId.length === 11) {
+      return videoId
+    }
+    const matchId = videoId.match(RE_YOUTUBE)
+    if (matchId?.length) {
+      return matchId[1]
+    }
+    throw new YoutubeTranscriptError('Impossible to retrieve Youtube video ID.')
+  }
+}