From 895d91c0e1ff184ce4078b202f95b815841fcf4c Mon Sep 17 00:00:00 2001 From: Justin Milner Date: Sun, 12 May 2024 18:16:06 -0400 Subject: [PATCH 01/10] Drafted max depth changes --- binary/src/core.ts | 4 ++- core/indexing/docs/crawl.ts | 38 +++++++++++++++++++- core/indexing/docs/index.ts | 3 +- core/indexing/docs/preIndexedDocs.ts | 3 +- core/protocol.ts | 3 +- extensions/vscode/src/webviewProtocol.ts | 5 +-- gui/src/components/dialogs/AddDocsDialog.tsx | 14 ++++++-- 7 files changed, 61 insertions(+), 9 deletions(-) diff --git a/binary/src/core.ts b/binary/src/core.ts index 571f035461..e224b89973 100644 --- a/binary/src/core.ts +++ b/binary/src/core.ts @@ -128,10 +128,12 @@ export class Core { // Context providers on("context/addDocs", async (msg) => { + const { startUrl, title, maxDepth } = msg.data; for await (const _ of indexDocs( msg.data.title, - new URL(msg.data.url), + new URL(msg.data.rootUrl), new TransformersJsEmbeddingsProvider(), + msg.data.maxDepth )) { } }); diff --git a/core/indexing/docs/crawl.ts b/core/indexing/docs/crawl.ts index e62c8d76e8..34335f094d 100644 --- a/core/indexing/docs/crawl.ts +++ b/core/indexing/docs/crawl.ts @@ -129,7 +129,7 @@ export type PageData = { html: string; }; -export async function* crawlPage(url: URL): AsyncGenerator { +export async function* crawlPage(url: URL, maxDepth?: number): AsyncGenerator { const { baseUrl, basePath } = splitUrl(url); let paths: string[] = [basePath]; @@ -172,3 +172,39 @@ export async function* crawlPage(url: URL): AsyncGenerator { ); } } + +export async function* crawlPage2(url: URL, maxDepth: number = 3): AsyncGenerator { + const { baseUrl, basePath } = splitUrl(url); + let paths: { path: string; depth: number }[] = [{ path: basePath, depth: 0 }]; + + let index = 0; + + while (index < paths.length) { + const batch = paths.slice(index, index + 50); + + const promises = batch.map(({ path, depth }) => getLinksFromUrl(baseUrl, path).then(links => ({ links, path, depth }))); // Adjust for depth tracking + + const results = await Promise.all(promises); + + for (const { links: { html, links: linksArray }, path, depth } of results) { + if (html !== "" && depth <= maxDepth) { // Check depth + yield { + url: url.toString(), + path, + html, + }; + } + + // Ensure we only add links if within depth limit + if (depth < maxDepth) { + for (let link of linksArray) { + if (!paths.some(p => p.path === link)) { + paths.push({ path: link, depth: depth + 1 }); // Increment depth for new paths + } + } + } + } + + index += batch.length; // Proceed to next batch + } +} diff --git a/core/indexing/docs/index.ts b/core/indexing/docs/index.ts index 255036bf24..96db7c4fd7 100644 --- a/core/indexing/docs/index.ts +++ b/core/indexing/docs/index.ts @@ -8,6 +8,7 @@ export async function* indexDocs( title: string, baseUrl: URL, embeddingsProvider: EmbeddingsProvider, + maxDepth?: number ): AsyncGenerator { if (await hasDoc(baseUrl.toString())) { yield { @@ -26,7 +27,7 @@ export async function* indexDocs( const articles: Article[] = []; - for await (const page of crawlPage(baseUrl)) { + for await (const page of crawlPage(baseUrl, maxDepth)) { const article = pageToArticle(page); if (!article) continue; diff --git a/core/indexing/docs/preIndexedDocs.ts b/core/indexing/docs/preIndexedDocs.ts index 1a1d9ecbb9..0d8ae3a9b9 100644 --- a/core/indexing/docs/preIndexedDocs.ts +++ b/core/indexing/docs/preIndexedDocs.ts @@ -1,7 +1,8 @@ export interface SiteIndexingConfig { startUrl: string; - title: string; rootUrl: string; + title: string; + maxDepth?: number; } const configs: SiteIndexingConfig[] = [ diff --git a/core/protocol.ts b/core/protocol.ts index 3e37b00ee9..fc75b919e3 100644 --- a/core/protocol.ts +++ b/core/protocol.ts @@ -12,6 +12,7 @@ import { SessionInfo, } from "."; import { AutocompleteInput } from "./autocomplete/completionProvider"; +import { SiteIndexingConfig } from "./indexing/docs/preIndexedDocs"; import { IdeProtocol } from "./web/webviewProtocol"; export type ProtocolGeneratorType = AsyncGenerator<{ @@ -62,7 +63,7 @@ export type Protocol = { { title: string }, Promise, ]; - "context/addDocs": [{ title: string; url: string }, void]; + "context/addDocs": [SiteIndexingConfig, void]; "autocomplete/complete": [AutocompleteInput, Promise]; "autocomplete/cancel": [undefined, void]; "autocomplete/accept": [{ completionId: string }, void]; diff --git a/extensions/vscode/src/webviewProtocol.ts b/extensions/vscode/src/webviewProtocol.ts index a7cf5cb784..84ce86c28e 100644 --- a/extensions/vscode/src/webviewProtocol.ts +++ b/extensions/vscode/src/webviewProtocol.ts @@ -555,7 +555,7 @@ export class VsCodeWebviewProtocol { } }); this.on("context/addDocs", (msg) => { - const { url, title } = msg.data; + const { startUrl, title, maxDepth } = msg.data; const embeddingsProvider = new TransformersJsEmbeddingsProvider(); vscode.window.withProgress( { @@ -566,8 +566,9 @@ export class VsCodeWebviewProtocol { async (progress) => { for await (const update of indexDocs( title, - new URL(url), + new URL(startUrl), embeddingsProvider, + maxDepth )) { progress.report({ increment: update.progress, diff --git a/gui/src/components/dialogs/AddDocsDialog.tsx b/gui/src/components/dialogs/AddDocsDialog.tsx index 67ded9c22a..c6fcc39157 100644 --- a/gui/src/components/dialogs/AddDocsDialog.tsx +++ b/gui/src/components/dialogs/AddDocsDialog.tsx @@ -15,9 +15,13 @@ const GridDiv = styled.div` `; function AddDocsDialog() { + const defaultMaxDepth = 4 const [docsUrl, setDocsUrl] = React.useState(""); const [docsTitle, setDocsTitle] = React.useState(""); const [urlValid, setUrlValid] = React.useState(false); + const [maxDepth, setMaxDepth] = React.useState(defaultMaxDepth); + const [maxDepthValid, setMaxDepthValid] = React.useState(false) // ToDo + const dispatch = useDispatch(); const { addItem } = useContext(SubmenuContextProvidersContext); @@ -60,14 +64,20 @@ function AddDocsDialog() { value={docsTitle} onChange={(e) => setDocsTitle(e.target.value)} /> - + setMaxDepth(Number(e.target.value))} + />