Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(community): update YoutubeLoader implementation #7477

Merged
merged 2 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions deno.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
"readline": "https://deno.land/x/[email protected]/mod.ts",
"uuid": "npm:/uuid",
"youtubei.js": "npm:/youtubei.js",
"youtube-transcript": "npm:/youtube-transcript",
"neo4j-driver": "npm:/neo4j-driver",
"axios": "npm:/axios",
"@mendable/firecrawl-js": "npm:/@mendable/firecrawl-js",
Expand All @@ -40,4 +39,4 @@
"@smithy/util-utf8": "npm:/@smithy/util-utf8",
"@aws-sdk/types": "npm:/@aws-sdk/types"
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@ hide_table_of_contents: true

# YouTube transcripts

This covers how to load youtube transcript into LangChain documents.
This covers how to load YouTube transcripts into LangChain documents.

## Setup

You'll need to install the [youtube-transcript](https://www.npmjs.com/package/youtube-transcript) package
and [youtubei.js](https://www.npmjs.com/package/youtubei.js) to extract metadata:
You'll need to install the [youtubei.js](https://www.npmjs.com/package/youtubei.js) package, which is used to fetch both the transcript and the video metadata:

```bash npm2yarn
npm install @langchain/community @langchain/core youtube-transcript youtubei.js
npm install @langchain/community @langchain/core youtubei.js
```

## Usage
Expand Down
9 changes: 2 additions & 7 deletions libs/langchain-community/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,7 @@
"weaviate-ts-client": "^1.4.0",
"web-auth-library": "^1.0.3",
"word-extractor": "^1.0.4",
"youtube-transcript": "^1.0.6",
"youtubei.js": "^9.1.0"
"youtubei.js": "^12.2.0"
},
"peerDependencies": {
"@arcjet/redact": "^v1.0.0-alpha.23",
Expand Down Expand Up @@ -348,8 +347,7 @@
"web-auth-library": "^1.0.3",
"word-extractor": "*",
"ws": "^8.14.2",
"youtube-transcript": "^1.0.6",
"youtubei.js": "^9.1.0"
"youtubei.js": "*"
},
"peerDependenciesMeta": {
"@arcjet/redact": {
Expand Down Expand Up @@ -712,9 +710,6 @@
"ws": {
"optional": true
},
"youtube-transcript": {
"optional": true
},
"youtubei.js": {
"optional": true
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { test, expect } from "@jest/globals";
import { YoutubeLoader } from "../web/youtube.js";

// Integration test for YoutubeLoader: loads a real video by URL and checks
// both the transcript text and the metadata attached to the document.
// NOTE(review): presumably requires live network access to YouTube — confirm
// this file is only run as part of the integration suite.
test("Test Youtube loader", async () => {
  const youtubeLoader = YoutubeLoader.createFromUrl(
    "https://www.youtube.com/watch?v=FZhbJZEgKQ4",
    { language: "en", addVideoInfo: true }
  );

  const loadedDocs = await youtubeLoader.load();

  // Exactly one document is expected for the video.
  expect(loadedDocs.length).toBe(1);

  const [onlyDoc] = loadedDocs;

  // A known phrase from the English transcript must be present.
  expect(onlyDoc.pageContent).toContain(
    "One year ago, at the dawn of a new age,"
  );

  // `addVideoInfo: true` should populate title/author alongside the source id.
  const expectedMetadata = {
    author: "Microsoft",
    source: "FZhbJZEgKQ4",
    title: "Full Keynote: Satya Nadella at Microsoft Ignite 2023",
  };
  expect(onlyDoc.metadata).toMatchObject(expectedMetadata);
});
31 changes: 17 additions & 14 deletions libs/langchain-community/src/document_loaders/web/youtube.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { TranscriptResponse, YoutubeTranscript } from "youtube-transcript";
import { Innertube } from "youtubei.js";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
Expand Down Expand Up @@ -28,8 +27,7 @@ interface VideoMetadata {

/**
* A document loader for loading data from YouTube videos. It uses the
* youtube-transcript and youtubei.js libraries to fetch the transcript
* and video metadata.
* youtubei.js library to fetch the transcript and video metadata.
* @example
* ```typescript
* const loader = new YoutubeLoader(
Expand Down Expand Up @@ -87,37 +85,42 @@ export class YoutubeLoader extends BaseDocumentLoader {

/**
* Loads the transcript and video metadata from the specified YouTube
* video. It uses the youtube-transcript library to fetch the transcript
* and the youtubei.js library to fetch the video metadata.
* video. It uses the youtubei.js library to fetch the video metadata and transcripts.
* @returns An array of Documents representing the retrieved data.
*/
async load(): Promise<Document[]> {
let transcript: TranscriptResponse[] | undefined;
let transcript: string | undefined;
const metadata: VideoMetadata = {
source: this.videoId,
};
try {
transcript = await YoutubeTranscript.fetchTranscript(this.videoId, {
const youtube = await Innertube.create({
lang: this.language,
retrieve_player: false,
});
const info = await youtube.getInfo(this.videoId);
const transcriptData = await info.getTranscript();
transcript =
transcriptData.transcript.content?.body?.initial_segments
.map((segment) => segment.snippet.text)
.join(" ") ?? "";
if (transcript === undefined) {
throw new Error("Transcription not found");
}
if (this.addVideoInfo) {
const youtube = await Innertube.create();
const info = (await youtube.getBasicInfo(this.videoId)).basic_info;
metadata.description = info.short_description;
metadata.title = info.title;
metadata.view_count = info.view_count;
metadata.author = info.author;
const basicInfo = info.basic_info;
metadata.description = basicInfo.short_description;
metadata.title = basicInfo.title;
metadata.view_count = basicInfo.view_count;
metadata.author = basicInfo.author;
}
} catch (e: unknown) {
throw new Error(
`Failed to get YouTube video transcription: ${(e as Error).message}`
);
}
const document = new Document({
pageContent: transcript.map((item) => item.text).join(" "),
pageContent: transcript,
metadata,
});

Expand Down
59 changes: 19 additions & 40 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -8691,6 +8691,13 @@ __metadata:
languageName: node
linkType: hard

"@bufbuild/protobuf@npm:^2.0.0":
version: 2.2.3
resolution: "@bufbuild/protobuf@npm:2.2.3"
checksum: 567ca0497669a8944fe84a9fdfa236e4a91d5879190c0ec0c8727d5220cbc21a85d06a114ac1eb35387fc5cb1dcbb7adc583c4d4f6a2ecb34fbe61dcaa7e7e9b
languageName: node
linkType: hard

"@cerebras/cerebras_cloud_sdk@npm:^1.15.0":
version: 1.15.0
resolution: "@cerebras/cerebras_cloud_sdk@npm:1.15.0"
Expand Down Expand Up @@ -12040,8 +12047,7 @@ __metadata:
weaviate-ts-client: ^1.4.0
web-auth-library: ^1.0.3
word-extractor: ^1.0.4
youtube-transcript: ^1.0.6
youtubei.js: ^9.1.0
youtubei.js: ^12.2.0
zod: ^3.22.3
zod-to-json-schema: ^3.22.5
peerDependencies:
Expand Down Expand Up @@ -12170,8 +12176,7 @@ __metadata:
web-auth-library: ^1.0.3
word-extractor: "*"
ws: ^8.14.2
youtube-transcript: ^1.0.6
youtubei.js: ^9.1.0
youtubei.js: "*"
peerDependenciesMeta:
"@arcjet/redact":
optional: true
Expand Down Expand Up @@ -12413,8 +12418,6 @@ __metadata:
optional: true
ws:
optional: true
youtube-transcript:
optional: true
youtubei.js:
optional: true
languageName: unknown
Expand Down Expand Up @@ -23126,13 +23129,6 @@ __metadata:
languageName: node
linkType: hard

"centra@npm:^2.6.0":
version: 2.6.0
resolution: "centra@npm:2.6.0"
checksum: 3b4d44762bceb9e20f7e45d01ffb9e462523cf8a0186f6710c08863f0455bceabfbcb754d6b01ea095c3bdee09c4ebef912669dc2b391a9af400e9ba7e398bc5
languageName: node
linkType: hard

"chalk@npm:5.2.0, chalk@npm:^5.0.0, chalk@npm:^5.2.0":
version: 5.2.0
resolution: "chalk@npm:5.2.0"
Expand Down Expand Up @@ -32880,12 +32876,12 @@ __metadata:
languageName: node
linkType: hard

"jintr@npm:^1.1.0":
version: 1.1.0
resolution: "jintr@npm:1.1.0"
"jintr@npm:^3.2.0":
version: 3.2.0
resolution: "jintr@npm:3.2.0"
dependencies:
acorn: ^8.8.0
checksum: b61269ff80a46c71e837e893a4754fc2d0a941e3d577dc6307f0e67cebebf81e66f646c86bf6159fe7d851d829595d7a9e9e26392b9ede7b6b39d9664f1d090d
checksum: 8f526719fd77d6f7cd52c47c06c86573cb37a15e22ce8129a228ff605d7ea3d662d7c8ef37cad7b4df767f53ca11418ffa49ad4aa8776f62d94362aba8317ff3
languageName: node
linkType: hard

Expand Down Expand Up @@ -37265,15 +37261,6 @@ __metadata:
languageName: node
linkType: hard

"phin@npm:^3.5.0":
version: 3.7.0
resolution: "phin@npm:3.7.0"
dependencies:
centra: ^2.6.0
checksum: b0a35e943615c40a3ccd7d6a2dd062568258e6b36dceed3150d13d28cad906e9028e756ad6efe66963b43937879e8a3593f986d17aac968d42982b4e8702e539
languageName: node
linkType: hard

"pickleparser@npm:^0.2.1":
version: 0.2.1
resolution: "pickleparser@npm:0.2.1"
Expand Down Expand Up @@ -44833,23 +44820,15 @@ __metadata:
languageName: node
linkType: hard

"youtube-transcript@npm:^1.0.6":
version: 1.0.6
resolution: "youtube-transcript@npm:1.0.6"
dependencies:
phin: ^3.5.0
checksum: 7ca6a608834d2eb43d2d353ad58bb3fa86663e2f5730146a768c5c3ac423911680451a38c57f827aa7af8fb7df78a4ce3702019d988d87d9ed266f9d81aeb833
languageName: node
linkType: hard

"youtubei.js@npm:^9.1.0":
version: 9.1.0
resolution: "youtubei.js@npm:9.1.0"
"youtubei.js@npm:^12.2.0":
version: 12.2.0
resolution: "youtubei.js@npm:12.2.0"
dependencies:
jintr: ^1.1.0
"@bufbuild/protobuf": ^2.0.0
jintr: ^3.2.0
tslib: ^2.5.0
undici: ^5.19.1
checksum: 7a537d79435c362c3d4f0e101f85edca6b34c584b9cafeee28c4214fdcdcbb6b2ebba2571175e21a984cc5d66d0fe673d761f400dd232ecb16803bce878cb41d
checksum: 4c89a019c6b94363328e8d0d35b8d8266de1ee3db963a39b655bdaa15e4d899a107876ead53b7a1268837b9a756fecaf53be0b399545a7fe290c6da303010c8f
languageName: node
linkType: hard

Expand Down
Loading