From 88dd1ad77363b58e3271c20334776324f77ed1a1 Mon Sep 17 00:00:00 2001
From: wyang14
Date: Sun, 28 Apr 2024 17:55:30 +0100
Subject: [PATCH] bugfix: recursive url loader should not skip on urls not trailing with slash

---
 server/docstore/redis.ts           | 99 +++++++++++++++---------------
 server/utils/recursiveUrlLoader.ts | 31 ++++++----
 2 files changed, 68 insertions(+), 62 deletions(-)

diff --git a/server/docstore/redis.ts b/server/docstore/redis.ts
index f8c39f3..a6149a1 100644
--- a/server/docstore/redis.ts
+++ b/server/docstore/redis.ts
@@ -1,61 +1,60 @@
-import { Document } from "@langchain/core/documents";
-import { BaseStoreInterface } from "@langchain/core/stores";
-import { Redis } from "ioredis";
-import { createRedisClient } from "@/server/store/redis";
+import { Document } from "@langchain/core/documents"
+import { BaseStoreInterface } from "@langchain/core/stores"
+import { Redis } from "ioredis"
+import { createRedisClient } from "@/server/store/redis"

-export class RedisDocstore implements BaseStoreInterface<string, Document>
-{
-  _namespace: string;
-  _client: Redis;
+export class RedisDocstore implements BaseStoreInterface<string, Document> {
+  _namespace: string
+  _client: Redis

   constructor(namespace: string) {
-    this._namespace = namespace;
-    this._client = createRedisClient();
+    this._namespace = namespace
+    this._client = createRedisClient()
   }

   serializeDocument(doc: Document): string {
-    return JSON.stringify(doc);
+    return JSON.stringify(doc)
   }

   deserializeDocument(jsonString: string): Document {
-    const obj = JSON.parse(jsonString);
-    return new Document(obj);
+    const obj = JSON.parse(jsonString)
+    return new Document(obj)
   }

   getNamespacedKey(key: string): string {
-    return `${this._namespace}:${key}`;
+    return `${this._namespace}:${key}`
   }

   getKeys(): Promise<string[]> {
     return new Promise((resolve, reject) => {
-      const stream = this._client.scanStream({ match: this._namespace + '*' });
+      const stream = this._client.scanStream({ match: this._namespace + '*' })

-      const keys: string[] = [];
+      const keys: string[] = []
       stream.on('data', (resultKeys) => {
-        keys.push(...resultKeys);
-      });
+        keys.push(...resultKeys)
+      })

       stream.on('end', () => {
-        resolve(keys);
-      });
+        resolve(keys)
+      })

       stream.on('error', (err) => {
-        reject(err);
-      });
-    });
+        reject(err)
+      })
+    })
   }

   addText(key: string, value: string) {
-    this._client.set(this.getNamespacedKey(key), value);
+    this._client.set(this.getNamespacedKey(key), value)
   }

   async search(search: string): Promise<Document> {
-    const result = await this._client.get(this.getNamespacedKey(search));
+    const result = await this._client.get(this.getNamespacedKey(search))
     if (!result) {
-      throw new Error(`ID ${search} not found.`);
+      throw new Error(`ID ${search} not found.`)
     } else {
-      const document = this.deserializeDocument(result);
-      return document;
+      const document = this.deserializeDocument(result)
+      return document
     }
   }

@@ -66,71 +65,71 @@ export class RedisDocstore implements BaseStoreInterface
    */
   async add(texts: Record<string, Document>): Promise<void> {
     for (const [key, value] of Object.entries(texts)) {
-      console.log(`Adding ${key} to the store: ${this.serializeDocument(value)}`);
+      // console.log(`Adding ${key} to the store: ${this.serializeDocument(value)}`);
     }

-    const keys = [...await this.getKeys()];
-    const overlapping = Object.keys(texts).filter((x) => keys.includes(x));
+    const keys = [...await this.getKeys()]
+    const overlapping = Object.keys(texts).filter((x) => keys.includes(x))

     if (overlapping.length > 0) {
-      throw new Error(`Tried to add ids that already exist: ${overlapping}`);
+      throw new Error(`Tried to add ids that already exist: ${overlapping}`)
     }

     for (const [key, value] of Object.entries(texts)) {
-      this.addText(key, this.serializeDocument(value));
+      this.addText(key, this.serializeDocument(value))
     }
   }

   async mget(keys: string[]): Promise<Document[]> {
     return Promise.all(keys.map((key) => {
-      const document = this.search(key);
-      return document;
-    }));
+      const document = this.search(key)
+      return document
+    }))
   }

   async mset(keyValuePairs: [string, Document][]): Promise<void> {
     await Promise.all(
       keyValuePairs.map(([key, value]) => this.add({ [key]: value }))
-    );
+    )
   }

   async mdelete(_keys: string[]): Promise<void> {
-    throw new Error("Not implemented.");
+    throw new Error("Not implemented.")
   }

   // eslint-disable-next-line require-yield
   async *yieldKeys(_prefix?: string): AsyncGenerator<string> {
-    throw new Error("Not implemented");
+    throw new Error("Not implemented")
   }

   async deleteAll(): Promise<void> {
     return new Promise((resolve, reject) => {
-      let cursor = '0';
+      let cursor = '0'

       const scanCallback = (err, result) => {
         if (err) {
-          reject(err);
-          return;
+          reject(err)
+          return
         }

-        const [nextCursor, keys] = result;
+        const [nextCursor, keys] = result

         // Delete keys matching the prefix
         keys.forEach((key) => {
-          this._client.del(key);
-        });
+          this._client.del(key)
+        })

         // If the cursor is '0', we've iterated through all keys
         if (nextCursor === '0') {
-          resolve();
+          resolve()
         } else {
           // Continue scanning with the next cursor
-          this._client.scan(nextCursor, 'MATCH', `${this._namespace}:*`, scanCallback);
+          this._client.scan(nextCursor, 'MATCH', `${this._namespace}:*`, scanCallback)
         }
-      };
+      }

       // Start the initial SCAN operation
-      this._client.scan(cursor, 'MATCH', `${this._namespace}:*`, scanCallback);
-    });
+      this._client.scan(cursor, 'MATCH', `${this._namespace}:*`, scanCallback)
+    })
   }
 }
diff --git a/server/utils/recursiveUrlLoader.ts b/server/utils/recursiveUrlLoader.ts
index 4f1f577..a1270bf 100644
--- a/server/utils/recursiveUrlLoader.ts
+++ b/server/utils/recursiveUrlLoader.ts
@@ -83,6 +83,9 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo
     const allLinks = Array.from(
       new JSDOM(html).window.document.querySelectorAll('a')
     ).map((a) => a.href)
+
+    console.log("All Links: ", allLinks)
+
     const absolutePaths = []
     // eslint-disable-next-line no-script-url
     const invalidPrefixes = ['javascript:', 'mailto:', '#']
@@ -119,11 +122,14 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo
         this.excludeDirs.some((exDir) => standardizedLink.startsWith(exDir)) ||
         this.excludeGlobs.some(r => r.test(standardizedLink))
-      )
+      ) {
+        console.log("URL excluded: ", standardizedLink)
         continue
+      }

       if (link.startsWith('http')) {
         const isAllowed = !this.preventOutside || link.startsWith(baseUrl)
+        console.log(`URL ${link} is allowed: ${isAllowed}`)
         if (isAllowed) absolutePaths.push(link)
       } else if (link.startsWith('//')) {
         const base = new URL(baseUrl)
@@ -180,10 +186,10 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo
     visited: Set<string> = new Set(),
     depth = 0
   ): Promise<Document[]> {
+    console.log(`URL ${inputUrl} at depth ${depth}`)
     if (depth >= this.maxDepth) return []

-    let url = inputUrl
-    if (!inputUrl.endsWith('/')) url += '/'
+    const url = inputUrl

     const isExcluded = this.excludeDirs.some((exDir) => url.startsWith(exDir)) || this.excludeGlobs.some(r => r.test(url))
     if (isExcluded) return []
@@ -198,6 +204,9 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo
     const childUrls: string[] = this.getChildLinks(res, url)
+    console.log("Input URL: ", inputUrl)
+    console.log("Child URLs: ", childUrls)
+
     const results = await Promise.all(
       childUrls.map((childUrl) =>
         (async () => {
@@ -207,16 +216,14 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo
           const childDoc = await this.getUrlAsDoc(childUrl)
           if (!childDoc) return null

-          if (childUrl.endsWith('/')) {
-            const childUrlResponses = await this.getChildUrlsRecursive(
-              childUrl,
-              visited,
-              depth + 1
-            )
-            return [childDoc, ...childUrlResponses]
-          }
-          return [childDoc]
+          const childUrlResponses = await this.getChildUrlsRecursive(
+            childUrl,
+            visited,
+            depth + 1
+          )
+
+          return [childDoc, ...childUrlResponses]
         })()
       )
     )
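Not part of the patch: a minimal TypeScript sketch of the behavior this change targets in getChildUrlsRecursive. Previously the loader forced inputUrl to end with '/' and only recursed into child URLs ending with '/', so leaf pages such as https://example.com/docs/intro were fetched once but never crawled further; with the fix, every successfully loaded child URL is recursed into, still bounded by the visited set and maxDepth. The helpers fetchHtml and extractLinks below are hypothetical stand-ins for the loader's getUrlAsDoc/getChildLinks internals, and crawl is an illustrative name, not the loader's API.

// Sketch only (assumes Node 18+ for the global fetch). Hypothetical helpers:
const fetchHtml = async (url: string): Promise<string> => (await fetch(url)).text()

const extractLinks = (html: string, baseUrl: string): string[] =>
  Array.from(html.matchAll(/href="([^"]+)"/g), (m) => new URL(m[1], baseUrl).href)

// Simplified recursion mirroring the patched getChildUrlsRecursive: the old
// `if (!inputUrl.endsWith('/')) url += '/'` normalization and the
// `if (childUrl.endsWith('/'))` guard are gone, so children without a
// trailing slash are crawled as well.
async function crawl(
  inputUrl: string,
  maxDepth: number,
  visited: Set<string> = new Set(),
  depth = 0
): Promise<string[]> {
  if (depth >= maxDepth) return []

  const childUrls = extractLinks(await fetchHtml(inputUrl), inputUrl)

  const collected: string[] = []
  for (const childUrl of childUrls) {
    if (visited.has(childUrl)) continue
    visited.add(childUrl)
    // Recurse unconditionally (bounded by depth and visited), not only for '/'-terminated URLs.
    collected.push(childUrl, ...(await crawl(childUrl, maxDepth, visited, depth + 1)))
  }
  return collected
}

// Example: crawl('https://example.com/docs/intro', 2) now also expands pages
// whose URLs do not end with '/'.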