Skip to content

Commit

Permalink
fix(community): update YoutubeLoader implementation (#7477)
Browse files Browse the repository at this point in the history
Co-authored-by: jacoblee93 <[email protected]>
  • Loading branch information
sinedied and jacoblee93 authored Jan 9, 2025
1 parent a471516 commit d92af44
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 67 deletions.
3 changes: 1 addition & 2 deletions deno.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
"readline": "https://deno.land/x/[email protected]/mod.ts",
"uuid": "npm:/uuid",
"youtubei.js": "npm:/youtubei.js",
"youtube-transcript": "npm:/youtube-transcript",
"neo4j-driver": "npm:/neo4j-driver",
"axios": "npm:/axios",
"@mendable/firecrawl-js": "npm:/@mendable/firecrawl-js",
Expand All @@ -40,4 +39,4 @@
"@smithy/util-utf8": "npm:/@smithy/util-utf8",
"@aws-sdk/types": "npm:/@aws-sdk/types"
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@ hide_table_of_contents: true

# YouTube transcripts

This covers how to load youtube transcript into LangChain documents.
This covers how to load YouTube transcripts into LangChain documents.

## Setup

You'll need to install the [youtube-transcript](https://www.npmjs.com/package/youtube-transcript) package
and [youtubei.js](https://www.npmjs.com/package/youtubei.js) to extract metadata:
You'll need to install the [youtubei.js](https://www.npmjs.com/package/youtubei.js) package, which is used to fetch both the transcript and the video metadata:

```bash npm2yarn
npm install @langchain/community @langchain/core youtube-transcript youtubei.js
npm install @langchain/community @langchain/core youtubei.js
```

## Usage
Expand Down
9 changes: 2 additions & 7 deletions libs/langchain-community/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,7 @@
"weaviate-ts-client": "^1.4.0",
"web-auth-library": "^1.0.3",
"word-extractor": "^1.0.4",
"youtube-transcript": "^1.0.6",
"youtubei.js": "^9.1.0"
"youtubei.js": "^12.2.0"
},
"peerDependencies": {
"@arcjet/redact": "^v1.0.0-alpha.23",
Expand Down Expand Up @@ -348,8 +347,7 @@
"web-auth-library": "^1.0.3",
"word-extractor": "*",
"ws": "^8.14.2",
"youtube-transcript": "^1.0.6",
"youtubei.js": "^9.1.0"
"youtubei.js": "*"
},
"peerDependenciesMeta": {
"@arcjet/redact": {
Expand Down Expand Up @@ -712,9 +710,6 @@
"ws": {
"optional": true
},
"youtube-transcript": {
"optional": true
},
"youtubei.js": {
"optional": true
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { test, expect } from "@jest/globals";
import { YoutubeLoader } from "../web/youtube.js";

/**
 * Builds a YoutubeLoader from a video URL (English transcript, video info
 * enabled), loads it, and checks that exactly one document comes back
 * carrying the expected transcript text and metadata fields.
 */
test("Test Youtube loader", async () => {
  const loader = YoutubeLoader.createFromUrl(
    "https://www.youtube.com/watch?v=FZhbJZEgKQ4",
    { language: "en", addVideoInfo: true }
  );
  const loadedDocs = await loader.load();

  // The loader is expected to produce a single document for the video.
  expect(loadedDocs.length).toBe(1);

  const firstDoc = loadedDocs[0];

  // The transcript text should include the opening sentence.
  const expectedSnippet = "One year ago, at the dawn of a new age,";
  expect(firstDoc.pageContent).toContain(expectedSnippet);

  // Metadata is matched partially: only these keys are pinned.
  const expectedMetadata = {
    author: "Microsoft",
    source: "FZhbJZEgKQ4",
    title: "Full Keynote: Satya Nadella at Microsoft Ignite 2023",
  };
  expect(firstDoc.metadata).toMatchObject(expectedMetadata);
});
31 changes: 17 additions & 14 deletions libs/langchain-community/src/document_loaders/web/youtube.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { TranscriptResponse, YoutubeTranscript } from "youtube-transcript";
import { Innertube } from "youtubei.js";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
Expand Down Expand Up @@ -28,8 +27,7 @@ interface VideoMetadata {

/**
* A document loader for loading data from YouTube videos. It uses the
* youtube-transcript and youtubei.js libraries to fetch the transcript
* and video metadata.
* youtubei.js library to fetch the transcript and video metadata.
* @example
* ```typescript
* const loader = new YoutubeLoader(
Expand Down Expand Up @@ -87,37 +85,42 @@ export class YoutubeLoader extends BaseDocumentLoader {

/**
* Loads the transcript and video metadata from the specified YouTube
* video. It uses the youtube-transcript library to fetch the transcript
* and the youtubei.js library to fetch the video metadata.
* video. It uses the youtubei.js library to fetch the video metadata and transcripts.
* @returns An array of Documents representing the retrieved data.
*/
async load(): Promise<Document[]> {
let transcript: TranscriptResponse[] | undefined;
let transcript: string | undefined;
const metadata: VideoMetadata = {
source: this.videoId,
};
try {
transcript = await YoutubeTranscript.fetchTranscript(this.videoId, {
const youtube = await Innertube.create({
lang: this.language,
retrieve_player: false,
});
const info = await youtube.getInfo(this.videoId);
const transcriptData = await info.getTranscript();
transcript =
transcriptData.transcript.content?.body?.initial_segments
.map((segment) => segment.snippet.text)
.join(" ") ?? "";
if (transcript === undefined) {
throw new Error("Transcription not found");
}
if (this.addVideoInfo) {
const youtube = await Innertube.create();
const info = (await youtube.getBasicInfo(this.videoId)).basic_info;
metadata.description = info.short_description;
metadata.title = info.title;
metadata.view_count = info.view_count;
metadata.author = info.author;
const basicInfo = info.basic_info;
metadata.description = basicInfo.short_description;
metadata.title = basicInfo.title;
metadata.view_count = basicInfo.view_count;
metadata.author = basicInfo.author;
}
} catch (e: unknown) {
throw new Error(
`Failed to get YouTube video transcription: ${(e as Error).message}`
);
}
const document = new Document({
pageContent: transcript.map((item) => item.text).join(" "),
pageContent: transcript,
metadata,
});

Expand Down
59 changes: 19 additions & 40 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -8691,6 +8691,13 @@ __metadata:
languageName: node
linkType: hard

"@bufbuild/protobuf@npm:^2.0.0":
version: 2.2.3
resolution: "@bufbuild/protobuf@npm:2.2.3"
checksum: 567ca0497669a8944fe84a9fdfa236e4a91d5879190c0ec0c8727d5220cbc21a85d06a114ac1eb35387fc5cb1dcbb7adc583c4d4f6a2ecb34fbe61dcaa7e7e9b
languageName: node
linkType: hard

"@cerebras/cerebras_cloud_sdk@npm:^1.15.0":
version: 1.15.0
resolution: "@cerebras/cerebras_cloud_sdk@npm:1.15.0"
Expand Down Expand Up @@ -12040,8 +12047,7 @@ __metadata:
weaviate-ts-client: ^1.4.0
web-auth-library: ^1.0.3
word-extractor: ^1.0.4
youtube-transcript: ^1.0.6
youtubei.js: ^9.1.0
youtubei.js: ^12.2.0
zod: ^3.22.3
zod-to-json-schema: ^3.22.5
peerDependencies:
Expand Down Expand Up @@ -12170,8 +12176,7 @@ __metadata:
web-auth-library: ^1.0.3
word-extractor: "*"
ws: ^8.14.2
youtube-transcript: ^1.0.6
youtubei.js: ^9.1.0
youtubei.js: "*"
peerDependenciesMeta:
"@arcjet/redact":
optional: true
Expand Down Expand Up @@ -12413,8 +12418,6 @@ __metadata:
optional: true
ws:
optional: true
youtube-transcript:
optional: true
youtubei.js:
optional: true
languageName: unknown
Expand Down Expand Up @@ -23126,13 +23129,6 @@ __metadata:
languageName: node
linkType: hard

"centra@npm:^2.6.0":
version: 2.6.0
resolution: "centra@npm:2.6.0"
checksum: 3b4d44762bceb9e20f7e45d01ffb9e462523cf8a0186f6710c08863f0455bceabfbcb754d6b01ea095c3bdee09c4ebef912669dc2b391a9af400e9ba7e398bc5
languageName: node
linkType: hard

"chalk@npm:5.2.0, chalk@npm:^5.0.0, chalk@npm:^5.2.0":
version: 5.2.0
resolution: "chalk@npm:5.2.0"
Expand Down Expand Up @@ -32880,12 +32876,12 @@ __metadata:
languageName: node
linkType: hard

"jintr@npm:^1.1.0":
version: 1.1.0
resolution: "jintr@npm:1.1.0"
"jintr@npm:^3.2.0":
version: 3.2.0
resolution: "jintr@npm:3.2.0"
dependencies:
acorn: ^8.8.0
checksum: b61269ff80a46c71e837e893a4754fc2d0a941e3d577dc6307f0e67cebebf81e66f646c86bf6159fe7d851d829595d7a9e9e26392b9ede7b6b39d9664f1d090d
checksum: 8f526719fd77d6f7cd52c47c06c86573cb37a15e22ce8129a228ff605d7ea3d662d7c8ef37cad7b4df767f53ca11418ffa49ad4aa8776f62d94362aba8317ff3
languageName: node
linkType: hard

Expand Down Expand Up @@ -37265,15 +37261,6 @@ __metadata:
languageName: node
linkType: hard

"phin@npm:^3.5.0":
version: 3.7.0
resolution: "phin@npm:3.7.0"
dependencies:
centra: ^2.6.0
checksum: b0a35e943615c40a3ccd7d6a2dd062568258e6b36dceed3150d13d28cad906e9028e756ad6efe66963b43937879e8a3593f986d17aac968d42982b4e8702e539
languageName: node
linkType: hard

"pickleparser@npm:^0.2.1":
version: 0.2.1
resolution: "pickleparser@npm:0.2.1"
Expand Down Expand Up @@ -44833,23 +44820,15 @@ __metadata:
languageName: node
linkType: hard

"youtube-transcript@npm:^1.0.6":
version: 1.0.6
resolution: "youtube-transcript@npm:1.0.6"
dependencies:
phin: ^3.5.0
checksum: 7ca6a608834d2eb43d2d353ad58bb3fa86663e2f5730146a768c5c3ac423911680451a38c57f827aa7af8fb7df78a4ce3702019d988d87d9ed266f9d81aeb833
languageName: node
linkType: hard

"youtubei.js@npm:^9.1.0":
version: 9.1.0
resolution: "youtubei.js@npm:9.1.0"
"youtubei.js@npm:^12.2.0":
version: 12.2.0
resolution: "youtubei.js@npm:12.2.0"
dependencies:
jintr: ^1.1.0
"@bufbuild/protobuf": ^2.0.0
jintr: ^3.2.0
tslib: ^2.5.0
undici: ^5.19.1
checksum: 7a537d79435c362c3d4f0e101f85edca6b34c584b9cafeee28c4214fdcdcbb6b2ebba2571175e21a984cc5d66d0fe673d761f400dd232ecb16803bce878cb41d
checksum: 4c89a019c6b94363328e8d0d35b8d8266de1ee3db963a39b655bdaa15e4d899a107876ead53b7a1268837b9a756fecaf53be0b399545a7fe290c6da303010c8f
languageName: node
linkType: hard

Expand Down

0 comments on commit d92af44

Please sign in to comment.