Skip to content

Commit

Permalink
Add ability to grab youtube transcripts via doc processor (#470)
Browse files Browse the repository at this point in the history
* Add ability to grab youtube transcripts via doc processor

* dynamic imports
swap out Github for Youtube in placeholder text
  • Loading branch information
timothycarambat authored Dec 19, 2023
1 parent 4525824 commit ecf4295
Show file tree
Hide file tree
Showing 19 changed files with 353 additions and 17 deletions.
19 changes: 19 additions & 0 deletions collector/extensions/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,25 @@ function extensions(app) {
}
return;
});

// POST /ext/youtube-transcript
// Passes the parsed request body to the YouTube transcript loader and relays
// its { success, reason, data } result with a 200. Anything thrown along the
// way (including a failure to load the module) is logged and answered with a
// 400 whose data payload carries a null title and author.
app.post("/ext/youtube-transcript", async (request, response) => {
  try {
    // Required here, inside the handler, so the module is only resolved when
    // the route is actually hit.
    const loadYouTubeTranscript = require("../utils/extensions/YoutubeTranscript");
    const body = reqBody(request);
    const { success, reason, data } = await loadYouTubeTranscript(body);
    response.status(200).json({ success, reason, data });
  } catch (e) {
    console.error(e);
    const failure = {
      success: false,
      reason: e.message,
      data: { title: null, author: null },
    };
    response.status(400).json(failure);
  }
});
}

module.exports = extensions;
4 changes: 3 additions & 1 deletion collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@
"slugify": "^1.6.6",
"url-pattern": "^1.0.3",
"uuid": "^9.0.0",
"wavefile": "^11.0.0"
"wavefile": "^11.0.0",
"youtube-transcript": "^1.0.6",
"youtubei.js": "^8.0.0"
},
"devDependencies": {
"nodemon": "^2.0.22",
Expand Down
95 changes: 95 additions & 0 deletions collector/utils/extensions/YoutubeTranscript/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
const { YoutubeLoader } = require("langchain/document_loaders/web/youtube");
const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");

/**
 * Checks whether `url` looks like a link to a single YouTube video.
 *
 * Accepted shapes (https only):
 *   - https://youtu.be/<videoId>
 *   - https://youtube.com/watch?v=<videoId>
 *   - https://www.youtube.com/watch?v=<videoId>
 *
 * Additional query parameters and fragments (for example `&t=30s`,
 * `&list=...`, `?si=...`) do not invalidate the link; only the host, the
 * path, and the presence of a video id are inspected.
 *
 * @param {string} url - Candidate link, typically user supplied.
 * @returns {boolean} True when a video id can be located, false otherwise
 *   (including for non-string or unparseable input).
 */
function validYoutubeVideoUrl(url) {
  if (typeof url !== "string") return false;

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // Not an absolute URL at all.
    return false;
  }
  if (parsed.protocol !== "https:") return false;

  // Deliberately permissive so that ids accepted by the previous
  // pattern-based matcher keep passing; the point is to reject empty ids and
  // ids containing path or query separators.
  const videoIdChars = /^[A-Za-z0-9\-_~ %]+$/;
  const watchHosts = ["youtube.com", "www.youtube.com"];

  let videoId = null;
  if (parsed.hostname === "youtu.be") {
    // Short links carry the id as the whole path: "/<videoId>".
    videoId = parsed.pathname.slice(1);
  } else if (
    watchHosts.includes(parsed.hostname) &&
    parsed.pathname === "/watch"
  ) {
    videoId = parsed.searchParams.get("v");
  }

  return typeof videoId === "string" && videoIdChars.test(videoId);
}

/**
 * Fetches the transcript of a YouTube video and saves it as a document.
 *
 * Flow: validate the link, load the transcript (with video info) through
 * langchain's YoutubeLoader, merge the loaded pages into one string, then hand
 * the resulting record to `writeToServerDocuments`, targeting a per-author
 * folder under `server/storage/documents`.
 *
 * @param {Object} params
 * @param {string} params.url - Link to the video (youtu.be or youtube.com/watch).
 * @returns {Promise<{success: boolean, reason: (string|null), data?: {title: string, author: string}}>}
 *   `success: false` with a human-readable `reason` when the link is rejected
 *   or no transcript text is available; otherwise `success: true`,
 *   `reason: null`, and the video's title and author.
 * @throws Errors raised by the loader or by the filesystem calls are not
 *   caught here and propagate to the caller.
 */
async function loadYouTubeTranscript({ url }) {
  if (!validYoutubeVideoUrl(url)) {
    return {
      success: false,
      reason: "Invalid URL. Should be youtu.be or youtube.com/watch.",
    };
  }

  console.log(`-- Working YouTube ${url} --`);
  const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });
  const docs = await loader.load();

  if (!docs.length) {
    return {
      success: false,
      reason: "No transcript found for that YouTube video.",
    };
  }

  // Video-level info is read from the first loaded page.
  const metadata = docs[0].metadata;
  const content = docs.map((doc) => doc.pageContent).join("");

  if (!content.length) {
    return {
      success: false,
      reason: "No transcript could be parsed for that YouTube video.",
    };
  }

  // One fallback shared by the stored record, the log line and the file
  // name, so a video without a title cannot make `slugify` throw.
  const title = metadata.title || url;

  const outFolder = slugify(
    `${metadata.author} YouTube transcripts`
  ).toLowerCase();
  const outFolderPath = path.resolve(
    __dirname,
    `../../../../server/storage/documents/${outFolder}`
  );
  // `recursive` also creates missing parent folders and does nothing when the
  // folder already exists, so no separate existence check is needed.
  fs.mkdirSync(outFolderPath, { recursive: true });

  const data = {
    id: v4(),
    url: `${url}.youtube`,
    title,
    docAuthor: metadata.author,
    description: metadata.description,
    docSource: url,
    chunkSource: url,
    // Time of import, not the video's upload date.
    published: new Date().toLocaleString(),
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content).length,
  };

  console.log(`[YouTube Loader]: Saving ${title} to ${outFolder}`);
  writeToServerDocuments(data, `${slugify(title)}-${data.id}`, outFolderPath);

  return {
    success: true,
    // No failure to report; previously a leftover "test" placeholder.
    reason: null,
    data: {
      title,
      author: metadata.author,
    },
  };
}

module.exports = loadYouTubeTranscript;
54 changes: 53 additions & 1 deletion collector/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@
chalk "^2.4.2"
js-tokens "^4.0.0"

"@fastify/busboy@^2.0.0":
version "2.1.0"
resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.1.0.tgz#0709e9f4cb252351c609c6e6d8d6779a8d25edff"
integrity sha512-+KpH+QxZU7O4675t3mnkQKcZZg56u+K/Ct2K+N2AZYNVK8kyeo/bI18tI8aPm3tvNNRyTWfj6s5tnGNlcbQRsA==

"@googleapis/youtube@^9.0.0":
version "9.0.0"
resolved "https://registry.yarnpkg.com/@googleapis/youtube/-/youtube-9.0.0.tgz#e45f6f5f7eac198c6391782b94b3ca54bacf0b63"
Expand Down Expand Up @@ -252,6 +257,11 @@ accepts@~1.3.8:
mime-types "~2.1.34"
negotiator "0.6.3"

acorn@^8.8.0:
version "8.11.2"
resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.11.2.tgz#ca0d78b51895be5390a5903c5b3bdcdaf78ae40b"
integrity sha512-nc0Axzp/0FILLEVsm4fNwLCwMttvhEI263QtVPQcbpfZZ3ts0hLsZGOpE6czNlid7CJ9MlyH8reXkpsf3YUY4w==

agent-base@6:
version "6.0.2"
resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-6.0.2.tgz#49fff58577cfee3f37176feab4c22e00f86d7f77"
Expand Down Expand Up @@ -554,6 +564,11 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==

centra@^2.6.0:
version "2.6.0"
resolved "https://registry.yarnpkg.com/centra/-/centra-2.6.0.tgz#79117998ee6908642258db263871381aa5d1204a"
integrity sha512-dgh+YleemrT8u85QL11Z6tYhegAs3MMxsaWAq/oXeAmYJ7VxL3SI9TZtnfaEvNDMAPolj25FXIb3S+HCI4wQaQ==

chalk@^2.4.2:
version "2.4.2"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
Expand Down Expand Up @@ -1655,6 +1670,13 @@ isexe@^2.0.0:
resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10"
integrity sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==

jintr@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/jintr/-/jintr-1.1.0.tgz#223a3b07f5e03d410cec6e715c537c8ad1e714c3"
integrity sha512-Tu9wk3BpN2v+kb8yT6YBtue+/nbjeLFv4vvVC4PJ7oCidHKbifWhvORrAbQfxVIQZG+67am/mDagpiGSVtvrZg==
dependencies:
acorn "^8.8.0"

js-tiktoken@^1.0.7:
version "1.0.7"
resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.7.tgz#56933fcd2093e8304060dfde3071bda91812e6f5"
Expand Down Expand Up @@ -2431,6 +2453,13 @@ pend@~1.2.0:
resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
integrity sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==

phin@^3.5.0:
version "3.7.0"
resolved "https://registry.yarnpkg.com/phin/-/phin-3.7.0.tgz#eeeff7660408515d8cf0c6252901012d4ab7153b"
integrity sha512-DqnVNrpYhKGBZppNKprD+UJylMeEKOZxHgPB+ZP6mGzf3uA2uox4Ep9tUm+rUc8WLIdHT3HcAE4X8fhwQA9JKg==
dependencies:
centra "^2.6.0"

picomatch@^2.0.4, picomatch@^2.2.1:
version "2.3.1"
resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42"
Expand Down Expand Up @@ -3069,7 +3098,7 @@ tr46@~0.0.3:
resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a"
integrity sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==

tslib@^2.0.1:
tslib@^2.0.1, tslib@^2.5.0:
version "2.6.2"
resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.6.2.tgz#703ac29425e7b37cd6fd456e92404d46d1f3e4ae"
integrity sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==
Expand Down Expand Up @@ -3122,6 +3151,13 @@ undici-types@~5.26.4:
resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.26.5.tgz#bcd539893d00b56e964fd2657a4866b221a65617"
integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==

undici@^5.19.1:
version "5.28.2"
resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.2.tgz#fea200eac65fc7ecaff80a023d1a0543423b4c91"
integrity sha512-wh1pHJHnUeQV5Xa8/kyQhO7WFa8M34l026L5P/+2TYiakvGy5Rdc8jWZVyG7ieht/0WgJLEd3kcU5gKx+6GC8w==
dependencies:
"@fastify/busboy" "^2.0.0"

universalify@^0.1.0:
version "0.1.2"
resolved "https://registry.yarnpkg.com/universalify/-/universalify-0.1.2.tgz#b646f69be3942dabcecc9d6639c80dc105efaa66"
Expand Down Expand Up @@ -3279,6 +3315,22 @@ yauzl@^2.10.0, yauzl@^2.4.2:
buffer-crc32 "~0.2.3"
fd-slicer "~1.1.0"

youtube-transcript@^1.0.6:
version "1.0.6"
resolved "https://registry.yarnpkg.com/youtube-transcript/-/youtube-transcript-1.0.6.tgz#8414c04380d3ef1102bd00ca3729e94c46ae7a14"
integrity sha512-k/6uxB9voj/5astl6+q+VArX/aWHhnmle8BucvUCTYTQQEOSVlBiXkrI0KD3o8A0b44MV6q0bmVNiJFIpTlcZA==
dependencies:
phin "^3.5.0"

youtubei.js@^8.0.0:
version "8.0.0"
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-8.0.0.tgz#0fcbe332e263d9be6afe4e3d1917e9ddc1ffbed3"
integrity sha512-kUwHvqoB5vfaGaY1quAGcX5JPIyjr5fjj9Zj/ZwUDCrermz/r5uIkNiJ5cNHkmAJbZP9fdygzNMvGHd7fM445g==
dependencies:
jintr "^1.1.0"
tslib "^2.5.0"
undici "^5.19.1"

[email protected]:
version "3.20.3"
resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.20.3.tgz#8c95d8c20f20455ffa0b4b526c29703f35f6d787"
Expand Down
8 changes: 8 additions & 0 deletions frontend/src/components/DataConnectorOption/index.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,12 @@ export const DATA_CONNECTORS = {
"Import an entire public or private Github repository in a single click.",
link: "https://github.com",
},
"youtube-transcript": {
name: "YouTube Transcript",
path: paths.settings.dataConnectors.youtubeTranscript(),
image: ConnectorImages.youtube,
description:
"Import the transcription of an entire YouTube video from a link.",
link: "https://youtube.com",
},
};
4 changes: 4 additions & 0 deletions frontend/src/components/DataConnectorOption/media/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import Github from "./github.png";
import YouTube from "./youtube.png";

// Lookup of connector logo assets by connector key; each value is the image
// module imported at the top of this file.
const ConnectorImages = {
github: Github,
youtube: YouTube,
};

export default ConnectorImages;
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,19 @@ export default function FileRow({
selected ? "bg-sky-500/20" : ""
} ${expanded ? "bg-sky-500/10" : ""}`}`}
>
<div className="pl-4 col-span-4 flex gap-x-[4px] items-center">
<div className="pl-2 col-span-6 flex gap-x-[4px] items-center">
<div
className="w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer"
className="shrink-0 w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer"
role="checkbox"
aria-checked={selected}
tabIndex={0}
>
{selected && <div className="w-2 h-2 bg-white rounded-[2px]" />}
</div>
<File className="text-base font-bold w-4 h-4 mr-[3px]" weight="fill" />
<File
className="shrink-0 text-base font-bold w-4 h-4 mr-[3px]"
weight="fill"
/>
<div
className="relative"
onMouseEnter={handleMouseEnter}
Expand All @@ -88,7 +91,6 @@ export default function FileRow({
<p className="col-span-2 pl-3.5 whitespace-nowrap">
{formatDate(item?.published)}
</p>
<p className="col-span-2 pl-3">{item?.size || "---"}</p>
<p className="col-span-2 pl-2 uppercase">{getFileExtension(item.url)}</p>
<div className="col-span-2 flex justify-end items-center">
{item?.cached && (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ export default function FolderRow({
selected ? "bg-sky-500/20" : ""
}`}
>
<div className="col-span-4 flex gap-x-[4px] items-center">
<div className="col-span-6 flex gap-x-[4px] items-center">
<div
className="shrink-0 w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer"
role="checkbox"
Expand All @@ -79,7 +79,6 @@ export default function FolderRow({
</p>
</div>
<p className="col-span-2 pl-3.5" />
<p className="col-span-2 pl-3" />
<p className="col-span-2 pl-2" />
<div className="col-span-2 flex justify-end items-center">
{item.name !== "custom-documents" && (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,8 @@ export default function Directory({

<div className="relative w-[560px] h-[310px] bg-zinc-900 rounded-2xl">
<div className="rounded-t-2xl text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20 shadow-lg bg-zinc-900 sticky top-0 z-10">
<p className="col-span-4">Name</p>
<p className="col-span-6">Name</p>
<p className="col-span-2">Date</p>
<p className="col-span-2">Size</p>
<p className="col-span-2">Kind</p>
<p className="col-span-2">Cached</p>
</div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ export default function WorkspaceFileRow({
className={`items-center transition-all duration-200 text-white/80 text-xs grid grid-cols-12 py-2 pl-3.5 pr-8 border-b border-white/20 hover:bg-sky-500/20 cursor-pointer
${isMovedItem ? "bg-green-800/40" : ""}`}
>
<div className="col-span-4 flex gap-x-[4px] items-center">
<div className="col-span-6 flex gap-x-[4px] items-center">
<File
className="text-base font-bold w-4 h-4 ml-3 mr-[3px]"
weight="fill"
Expand All @@ -77,7 +77,6 @@ export default function WorkspaceFileRow({
<p className="col-span-2 pl-3.5 whitespace-nowrap">
{formatDate(item?.published)}
</p>
<p className="col-span-2 pl-3">{item?.size || "---"}</p>
<p className="col-span-2 pl-2 uppercase">{getFileExtension(item.url)}</p>
<div className="col-span-2 flex justify-end items-center">
{item?.cached && (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,8 @@ export default function WorkspaceDirectory({
</div>
<div className="relative w-[560px] h-[445px] bg-zinc-900 rounded-2xl mt-5">
<div className="text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20">
<p className="col-span-4">Name</p>
<p className="col-span-6">Name</p>
<p className="col-span-2">Date</p>
<p className="col-span-2">Size</p>
<p className="col-span-2">Kind</p>
<p className="col-span-2">Cached</p>
</div>
Expand Down Expand Up @@ -56,9 +55,8 @@ export default function WorkspaceDirectory({
}`}
>
<div className="text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20 bg-zinc-900 sticky top-0 z-10">
<p className="col-span-4">Name</p>
<p className="col-span-6">Name</p>
<p className="col-span-2">Date</p>
<p className="col-span-2">Size</p>
<p className="col-span-2">Kind</p>
<p className="col-span-2">Cached</p>
</div>
Expand Down
18 changes: 18 additions & 0 deletions frontend/src/models/dataConnector.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,24 @@ const DataConnector = {
});
},
},
youtube: {
transcribe: async ({ url }) => {
return await fetch(`${API_BASE}/ext/youtube/transcript`, {
method: "POST",
headers: baseHeaders(),
body: JSON.stringify({ url }),
})
.then((res) => res.json())
.then((res) => {
if (!res.success) throw new Error(res.reason);
return { data: res.data, error: null };
})
.catch((e) => {
console.error(e);
return { data: null, error: e.message };
});
},
},
};

export default DataConnector;
Loading

0 comments on commit ecf4295

Please sign in to comment.