Merge pull request #391 from sugarforever/bugfix/missing-urls-in-recursive-url-loader

bugfix: recursive url loader should not skip URLs without a trailing slash
sugarforever authored Apr 28, 2024
2 parents 3286e01 + 88dd1ad commit c69d846
Showing 2 changed files with 68 additions and 62 deletions.
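
For context, here is a standalone sketch of the behaviour this merge fixes (toy code, not taken from the repository): with the old guard, a child page whose URL has no trailing slash is returned as a document, but its own links are never followed.

// Toy illustration of the bug pattern; not code from this repository.
// `onlyRecurseIntoSlashUrls = true` mimics the old behaviour: pages whose URLs
// lack a trailing slash are collected but never crawled any deeper.
type SiteMap = Record<string, string[]>

function crawl(
  site: SiteMap,
  url: string,
  onlyRecurseIntoSlashUrls: boolean,
  visited: Set<string> = new Set()
): string[] {
  if (visited.has(url)) return []
  visited.add(url)

  const pages = [url]
  for (const child of site[url] ?? []) {
    if (onlyRecurseIntoSlashUrls && !child.endsWith('/')) {
      pages.push(child) // the page itself is kept ...
      continue          // ... but its children are silently skipped (the bug)
    }
    pages.push(...crawl(site, child, onlyRecurseIntoSlashUrls, visited))
  }
  return pages
}

const site: SiteMap = {
  'https://example.com/docs/': ['https://example.com/docs/setup'],
  'https://example.com/docs/setup': ['https://example.com/docs/setup/advanced'],
}

console.log(crawl(site, 'https://example.com/docs/', true).length)  // 2: /setup/advanced is never reached
console.log(crawl(site, 'https://example.com/docs/', false).length) // 3: every page is found
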
99 changes: 49 additions & 50 deletions server/docstore/redis.ts
@@ -1,61 +1,60 @@
-import { Document } from "@langchain/core/documents";
-import { BaseStoreInterface } from "@langchain/core/stores";
-import { Redis } from "ioredis";
-import { createRedisClient } from "@/server/store/redis";
+import { Document } from "@langchain/core/documents"
+import { BaseStoreInterface } from "@langchain/core/stores"
+import { Redis } from "ioredis"
+import { createRedisClient } from "@/server/store/redis"

-export class RedisDocstore implements BaseStoreInterface<string, Document>
-{
-  _namespace: string;
-  _client: Redis;
+export class RedisDocstore implements BaseStoreInterface<string, Document> {
+  _namespace: string
+  _client: Redis

   constructor(namespace: string) {
-    this._namespace = namespace;
-    this._client = createRedisClient();
+    this._namespace = namespace
+    this._client = createRedisClient()
   }

   serializeDocument(doc: Document): string {
-    return JSON.stringify(doc);
+    return JSON.stringify(doc)
   }

   deserializeDocument(jsonString: string): Document {
-    const obj = JSON.parse(jsonString);
-    return new Document(obj);
+    const obj = JSON.parse(jsonString)
+    return new Document(obj)
   }

   getNamespacedKey(key: string): string {
-    return `${this._namespace}:${key}`;
+    return `${this._namespace}:${key}`
   }

   getKeys(): Promise<string[]> {
     return new Promise((resolve, reject) => {
-      const stream = this._client.scanStream({ match: this._namespace + '*' });
+      const stream = this._client.scanStream({ match: this._namespace + '*' })

-      const keys: string[] = [];
+      const keys: string[] = []
       stream.on('data', (resultKeys) => {
-        keys.push(...resultKeys);
-      });
+        keys.push(...resultKeys)
+      })

       stream.on('end', () => {
-        resolve(keys);
-      });
+        resolve(keys)
+      })

       stream.on('error', (err) => {
-        reject(err);
-      });
-    });
+        reject(err)
+      })
+    })
   }

   addText(key: string, value: string) {
-    this._client.set(this.getNamespacedKey(key), value);
+    this._client.set(this.getNamespacedKey(key), value)
   }

   async search(search: string): Promise<Document> {
-    const result = await this._client.get(this.getNamespacedKey(search));
+    const result = await this._client.get(this.getNamespacedKey(search))
     if (!result) {
-      throw new Error(`ID ${search} not found.`);
+      throw new Error(`ID ${search} not found.`)
     } else {
-      const document = this.deserializeDocument(result);
-      return document;
+      const document = this.deserializeDocument(result)
+      return document
     }
   }

@@ -66,71 +65,71 @@ export class RedisDocstore implements BaseStoreInterface<string, Document>
    */
   async add(texts: Record<string, Document>): Promise<void> {
     for (const [key, value] of Object.entries(texts)) {
-      console.log(`Adding ${key} to the store: ${this.serializeDocument(value)}`);
+      // console.log(`Adding ${key} to the store: ${this.serializeDocument(value)}`);
     }

-    const keys = [...await this.getKeys()];
-    const overlapping = Object.keys(texts).filter((x) => keys.includes(x));
+    const keys = [...await this.getKeys()]
+    const overlapping = Object.keys(texts).filter((x) => keys.includes(x))

     if (overlapping.length > 0) {
-      throw new Error(`Tried to add ids that already exist: ${overlapping}`);
+      throw new Error(`Tried to add ids that already exist: ${overlapping}`)
     }

     for (const [key, value] of Object.entries(texts)) {
-      this.addText(key, this.serializeDocument(value));
+      this.addText(key, this.serializeDocument(value))
     }
   }

   async mget(keys: string[]): Promise<Document[]> {
     return Promise.all(keys.map((key) => {
-      const document = this.search(key);
-      return document;
-    }));
+      const document = this.search(key)
+      return document
+    }))
   }

   async mset(keyValuePairs: [string, Document][]): Promise<void> {
     await Promise.all(
       keyValuePairs.map(([key, value]) => this.add({ [key]: value }))
-    );
+    )
   }

   async mdelete(_keys: string[]): Promise<void> {
-    throw new Error("Not implemented.");
+    throw new Error("Not implemented.")
   }

   // eslint-disable-next-line require-yield
   async *yieldKeys(_prefix?: string): AsyncGenerator<string> {
-    throw new Error("Not implemented");
+    throw new Error("Not implemented")
   }

   async deleteAll(): Promise<void> {
     return new Promise((resolve, reject) => {
-      let cursor = '0';
+      let cursor = '0'

       const scanCallback = (err, result) => {
         if (err) {
-          reject(err);
-          return;
+          reject(err)
+          return
         }

-        const [nextCursor, keys] = result;
+        const [nextCursor, keys] = result

         // Delete keys matching the prefix
         keys.forEach((key) => {
-          this._client.del(key);
-        });
+          this._client.del(key)
+        })

         // If the cursor is '0', we've iterated through all keys
         if (nextCursor === '0') {
-          resolve();
+          resolve()
         } else {
           // Continue scanning with the next cursor
-          this._client.scan(nextCursor, 'MATCH', `${this._namespace}:*`, scanCallback);
+          this._client.scan(nextCursor, 'MATCH', `${this._namespace}:*`, scanCallback)
         }
-      };
+      }

       // Start the initial SCAN operation
-      this._client.scan(cursor, 'MATCH', `${this._namespace}:*`, scanCallback);
-    });
+      this._client.scan(cursor, 'MATCH', `${this._namespace}:*`, scanCallback)
+    })
   }
 }
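
A minimal usage sketch of the docstore as it reads after this change (illustrative only; the "@/server/docstore/redis" import path is assumed from the file location, and a Redis instance reachable through createRedisClient() is required):

import { Document } from "@langchain/core/documents"
import { RedisDocstore } from "@/server/docstore/redis" // assumed import path

async function demo() {
  const store = new RedisDocstore("my-namespace")

  // Documents are written to Redis under `${namespace}:${key}`.
  await store.add({
    "doc-1": new Document({ pageContent: "hello", metadata: { source: "https://example.com" } }),
  })

  // mget() resolves each key through search() and deserializes the stored JSON.
  const [doc] = await store.mget(["doc-1"])
  console.log(doc.pageContent) // "hello"

  // Removes every key in the namespace via incremental SCAN.
  await store.deleteAll()
}

demo().catch(console.error)
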
31 changes: 19 additions & 12 deletions server/utils/recursiveUrlLoader.ts
@@ -83,6 +83,9 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo
     const allLinks = Array.from(
       new JSDOM(html).window.document.querySelectorAll('a')
     ).map((a) => a.href)
+
+    console.log("All Links: ", allLinks)
+
     const absolutePaths = []
     // eslint-disable-next-line no-script-url
     const invalidPrefixes = ['javascript:', 'mailto:', '#']
@@ -119,11 +122,14 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo
         this.excludeDirs.some((exDir) => standardizedLink.startsWith(exDir))
         ||
         this.excludeGlobs.some(r => r.test(standardizedLink))
-      )
+      ) {
+        console.log("URL excluded: ", standardizedLink)
         continue
+      }

       if (link.startsWith('http')) {
         const isAllowed = !this.preventOutside || link.startsWith(baseUrl)
+        console.log(`URL ${link} is allowed: ${isAllowed}`)
         if (isAllowed) absolutePaths.push(link)
       } else if (link.startsWith('//')) {
         const base = new URL(baseUrl)
@@ -180,10 +186,10 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo
     visited: Set<string> = new Set<string>(),
     depth = 0
   ): Promise<Document[]> {
+    console.log(`URL ${inputUrl} at depth ${depth}`)
     if (depth >= this.maxDepth) return []

-    let url = inputUrl
-    if (!inputUrl.endsWith('/')) url += '/'
+    const url = inputUrl

     const isExcluded = this.excludeDirs.some((exDir) => url.startsWith(exDir)) || this.excludeGlobs.some(r => r.test(url))
     if (isExcluded) return []
@@ -198,6 +204,9 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo

     const childUrls: string[] = this.getChildLinks(res, url)

+    console.log("Input URL: ", inputUrl)
+    console.log("Child URLs: ", childUrls)
+
     const results = await Promise.all(
       childUrls.map((childUrl) =>
         (async () => {
@@ -207,16 +216,14 @@ export class RecursiveUrlLoader extends BaseDocumentLoader implements DocumentLo
           const childDoc = await this.getUrlAsDoc(childUrl)
           if (!childDoc) return null

-          if (childUrl.endsWith('/')) {
-            const childUrlResponses = await this.getChildUrlsRecursive(
-              childUrl,
-              visited,
-              depth + 1
-            )
-            return [childDoc, ...childUrlResponses]
-          }
-
-          return [childDoc]
+          const childUrlResponses = await this.getChildUrlsRecursive(
+            childUrl,
+            visited,
+            depth + 1
+          )
+
+          return [childDoc, ...childUrlResponses]
         })()
       )
     )
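
A hypothetical usage sketch for the loader is below; the constructor options are an assumption (mirroring LangChain's RecursiveUrlLoader), since only the maxDepth, excludeDirs, excludeGlobs and preventOutside fields are visible in the diff above.

import { RecursiveUrlLoader } from "@/server/utils/recursiveUrlLoader" // assumed import path

async function crawlDocs() {
  // The options object is assumed; adjust to the actual constructor signature.
  const loader = new RecursiveUrlLoader("https://example.com/docs/intro", {
    maxDepth: 2,
    excludeDirs: ["https://example.com/docs/changelog"],
  })

  // Before this fix, child pages without a trailing slash (e.g. .../docs/intro/setup)
  // were loaded once but never crawled for further links.
  const docs = await loader.load()
  console.log(`Loaded ${docs.length} documents`)
}

crawlDocs().catch(console.error)
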
