From d92af4469ca0d933f108cdb52cace162c8495ae0 Mon Sep 17 00:00:00 2001 From: Yohan Lasorsa Date: Thu, 9 Jan 2025 05:59:17 +0100 Subject: [PATCH] fix(community): update YoutubeLoader implementation (#7477) Co-authored-by: jacoblee93 --- deno.json | 3 +- .../document_loaders/web_loaders/youtube.mdx | 7 +-- libs/langchain-community/package.json | 9 +-- .../tests/youtube.int.test.ts | 21 +++++++ .../src/document_loaders/web/youtube.ts | 31 +++++----- yarn.lock | 59 ++++++------------- 6 files changed, 63 insertions(+), 67 deletions(-) create mode 100644 libs/langchain-community/src/document_loaders/tests/youtube.int.test.ts diff --git a/deno.json b/deno.json index 4c6004fa02b2..e3054ca67afb 100644 --- a/deno.json +++ b/deno.json @@ -28,7 +28,6 @@ "readline": "https://deno.land/x/readline@v1.1.0/mod.ts", "uuid": "npm:/uuid", "youtubei.js": "npm:/youtubei.js", - "youtube-transcript": "npm:/youtube-transcript", "neo4j-driver": "npm:/neo4j-driver", "axios": "npm:/axios", "@mendable/firecrawl-js": "npm:/@mendable/firecrawl-js", @@ -40,4 +39,4 @@ "@smithy/util-utf8": "npm:/@smithy/util-utf8", "@aws-sdk/types": "npm:/@aws-sdk/types" } -} \ No newline at end of file +} diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/youtube.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/youtube.mdx index 9c65c0164d24..79fb16193d38 100644 --- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/youtube.mdx +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/youtube.mdx @@ -4,15 +4,14 @@ hide_table_of_contents: true # YouTube transcripts -This covers how to load youtube transcript into LangChain documents. +This covers how to load YouTube transcripts into LangChain documents. 
## Setup -You'll need to install the [youtube-transcript](https://www.npmjs.com/package/youtube-transcript) package -and [youtubei.js](https://www.npmjs.com/package/youtubei.js) to extract metadata: +You'll need to install the [youtubei.js](https://www.npmjs.com/package/youtubei.js) package to fetch transcripts and metadata: ```bash npm2yarn -npm install @langchain/community @langchain/core youtube-transcript youtubei.js +npm install @langchain/community @langchain/core youtubei.js ``` ## Usage diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index eaea340ac50d..b24354aaaaee 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -219,8 +219,7 @@ "weaviate-ts-client": "^1.4.0", "web-auth-library": "^1.0.3", "word-extractor": "^1.0.4", - "youtube-transcript": "^1.0.6", - "youtubei.js": "^9.1.0" + "youtubei.js": "^12.2.0" }, "peerDependencies": { "@arcjet/redact": "^v1.0.0-alpha.23", @@ -348,8 +347,7 @@ "web-auth-library": "^1.0.3", "word-extractor": "*", "ws": "^8.14.2", - "youtube-transcript": "^1.0.6", - "youtubei.js": "^9.1.0" + "youtubei.js": "*" }, "peerDependenciesMeta": { "@arcjet/redact": { @@ -712,9 +710,6 @@ "ws": { "optional": true }, - "youtube-transcript": { - "optional": true - }, "youtubei.js": { "optional": true } diff --git a/libs/langchain-community/src/document_loaders/tests/youtube.int.test.ts b/libs/langchain-community/src/document_loaders/tests/youtube.int.test.ts new file mode 100644 index 000000000000..48d462a8a7d8 --- /dev/null +++ b/libs/langchain-community/src/document_loaders/tests/youtube.int.test.ts @@ -0,0 +1,21 @@ +import { test, expect } from "@jest/globals"; +import { YoutubeLoader } from "../web/youtube.js"; + +test("Test Youtube loader", async () => { + const videoUrl = "https://www.youtube.com/watch?v=FZhbJZEgKQ4"; + const loader = YoutubeLoader.createFromUrl(videoUrl, { + language: "en", + addVideoInfo: true, + }); + const docs = await loader.load(); + 
expect(docs.length).toBe(1); + expect(docs[0].pageContent).toContain( + "One year ago, at the dawn of a new age," + ); + expect(docs[0].metadata).toMatchObject({ + author: "Microsoft", + source: "FZhbJZEgKQ4", + title: "Full Keynote: Satya Nadella at Microsoft Ignite 2023", + }); +}); diff --git a/libs/langchain-community/src/document_loaders/web/youtube.ts b/libs/langchain-community/src/document_loaders/web/youtube.ts index f9ce18abc040..d66b86e13592 100644 --- a/libs/langchain-community/src/document_loaders/web/youtube.ts +++ b/libs/langchain-community/src/document_loaders/web/youtube.ts @@ -1,4 +1,3 @@ -import { TranscriptResponse, YoutubeTranscript } from "youtube-transcript"; import { Innertube } from "youtubei.js"; import { Document } from "@langchain/core/documents"; import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; @@ -28,8 +27,7 @@ interface VideoMetadata { /** * A document loader for loading data from YouTube videos. It uses the - * youtube-transcript and youtubei.js libraries to fetch the transcript - * and video metadata. + * youtubei.js library to fetch the transcript and video metadata. * @example * ```typescript * const loader = new YoutubeLoader( @@ -87,29 +85,34 @@ export class YoutubeLoader extends BaseDocumentLoader { /** * Loads the transcript and video metadata from the specified YouTube - * video. It uses the youtube-transcript library to fetch the transcript - * and the youtubei.js library to fetch the video metadata. + * video. It uses the youtubei.js library to fetch the video metadata and transcripts. * @returns An array of Documents representing the retrieved data. 
*/ async load(): Promise { - let transcript: TranscriptResponse[] | undefined; + let transcript: string | undefined; const metadata: VideoMetadata = { source: this.videoId, }; try { - transcript = await YoutubeTranscript.fetchTranscript(this.videoId, { + const youtube = await Innertube.create({ lang: this.language, + retrieve_player: false, }); + const info = await youtube.getInfo(this.videoId); + const transcriptData = await info.getTranscript(); + transcript = + transcriptData.transcript.content?.body?.initial_segments + .map((segment) => segment.snippet.text) + .join(" ") ?? ""; if (transcript === undefined) { throw new Error("Transcription not found"); } if (this.addVideoInfo) { - const youtube = await Innertube.create(); - const info = (await youtube.getBasicInfo(this.videoId)).basic_info; - metadata.description = info.short_description; - metadata.title = info.title; - metadata.view_count = info.view_count; - metadata.author = info.author; + const basicInfo = info.basic_info; + metadata.description = basicInfo.short_description; + metadata.title = basicInfo.title; + metadata.view_count = basicInfo.view_count; + metadata.author = basicInfo.author; } } catch (e: unknown) { throw new Error( @@ -117,7 +120,7 @@ export class YoutubeLoader extends BaseDocumentLoader { ); } const document = new Document({ - pageContent: transcript.map((item) => item.text).join(" "), + pageContent: transcript, metadata, }); diff --git a/yarn.lock b/yarn.lock index 2bb9ce4b777c..3ea9be9a3510 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8691,6 +8691,13 @@ __metadata: languageName: node linkType: hard +"@bufbuild/protobuf@npm:^2.0.0": + version: 2.2.3 + resolution: "@bufbuild/protobuf@npm:2.2.3" + checksum: 567ca0497669a8944fe84a9fdfa236e4a91d5879190c0ec0c8727d5220cbc21a85d06a114ac1eb35387fc5cb1dcbb7adc583c4d4f6a2ecb34fbe61dcaa7e7e9b + languageName: node + linkType: hard + "@cerebras/cerebras_cloud_sdk@npm:^1.15.0": version: 1.15.0 resolution: "@cerebras/cerebras_cloud_sdk@npm:1.15.0" @@ 
-12040,8 +12047,7 @@ __metadata: weaviate-ts-client: ^1.4.0 web-auth-library: ^1.0.3 word-extractor: ^1.0.4 - youtube-transcript: ^1.0.6 - youtubei.js: ^9.1.0 + youtubei.js: ^12.2.0 zod: ^3.22.3 zod-to-json-schema: ^3.22.5 peerDependencies: @@ -12170,8 +12176,7 @@ __metadata: web-auth-library: ^1.0.3 word-extractor: "*" ws: ^8.14.2 - youtube-transcript: ^1.0.6 - youtubei.js: ^9.1.0 + youtubei.js: "*" peerDependenciesMeta: "@arcjet/redact": optional: true @@ -12413,8 +12418,6 @@ __metadata: optional: true ws: optional: true - youtube-transcript: - optional: true youtubei.js: optional: true languageName: unknown @@ -23126,13 +23129,6 @@ __metadata: languageName: node linkType: hard -"centra@npm:^2.6.0": - version: 2.6.0 - resolution: "centra@npm:2.6.0" - checksum: 3b4d44762bceb9e20f7e45d01ffb9e462523cf8a0186f6710c08863f0455bceabfbcb754d6b01ea095c3bdee09c4ebef912669dc2b391a9af400e9ba7e398bc5 - languageName: node - linkType: hard - "chalk@npm:5.2.0, chalk@npm:^5.0.0, chalk@npm:^5.2.0": version: 5.2.0 resolution: "chalk@npm:5.2.0" @@ -32880,12 +32876,12 @@ __metadata: languageName: node linkType: hard -"jintr@npm:^1.1.0": - version: 1.1.0 - resolution: "jintr@npm:1.1.0" +"jintr@npm:^3.2.0": + version: 3.2.0 + resolution: "jintr@npm:3.2.0" dependencies: acorn: ^8.8.0 - checksum: b61269ff80a46c71e837e893a4754fc2d0a941e3d577dc6307f0e67cebebf81e66f646c86bf6159fe7d851d829595d7a9e9e26392b9ede7b6b39d9664f1d090d + checksum: 8f526719fd77d6f7cd52c47c06c86573cb37a15e22ce8129a228ff605d7ea3d662d7c8ef37cad7b4df767f53ca11418ffa49ad4aa8776f62d94362aba8317ff3 languageName: node linkType: hard @@ -37265,15 +37261,6 @@ __metadata: languageName: node linkType: hard -"phin@npm:^3.5.0": - version: 3.7.0 - resolution: "phin@npm:3.7.0" - dependencies: - centra: ^2.6.0 - checksum: b0a35e943615c40a3ccd7d6a2dd062568258e6b36dceed3150d13d28cad906e9028e756ad6efe66963b43937879e8a3593f986d17aac968d42982b4e8702e539 - languageName: node - linkType: hard - "pickleparser@npm:^0.2.1": version: 0.2.1 
resolution: "pickleparser@npm:0.2.1" @@ -44833,23 +44820,15 @@ __metadata: languageName: node linkType: hard -"youtube-transcript@npm:^1.0.6": - version: 1.0.6 - resolution: "youtube-transcript@npm:1.0.6" - dependencies: - phin: ^3.5.0 - checksum: 7ca6a608834d2eb43d2d353ad58bb3fa86663e2f5730146a768c5c3ac423911680451a38c57f827aa7af8fb7df78a4ce3702019d988d87d9ed266f9d81aeb833 - languageName: node - linkType: hard - -"youtubei.js@npm:^9.1.0": - version: 9.1.0 - resolution: "youtubei.js@npm:9.1.0" +"youtubei.js@npm:^12.2.0": + version: 12.2.0 + resolution: "youtubei.js@npm:12.2.0" dependencies: - jintr: ^1.1.0 + "@bufbuild/protobuf": ^2.0.0 + jintr: ^3.2.0 tslib: ^2.5.0 undici: ^5.19.1 - checksum: 7a537d79435c362c3d4f0e101f85edca6b34c584b9cafeee28c4214fdcdcbb6b2ebba2571175e21a984cc5d66d0fe673d761f400dd232ecb16803bce878cb41d + checksum: 4c89a019c6b94363328e8d0d35b8d8266de1ee3db963a39b655bdaa15e4d899a107876ead53b7a1268837b9a756fecaf53be0b399545a7fe290c6da303010c8f languageName: node linkType: hard