diff --git a/apps/api/requests.http b/apps/api/requests.http index 751ba5eed..495df975d 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -13,15 +13,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1 ### Scrape Website -POST https://api.firecrawl.dev/v0/scrape HTTP/1.1 +POST http://localhost:3002/v0/crawl HTTP/1.1 Authorization: Bearer content-type: application/json { - "url":"https://www.mendable.ai" + "url":"https://www.mendable.ai", + "crawlerOptions": { + "returnOnlyUrls": true + } } + + + + + + ### Scrape Website POST http://localhost:3002/v0/scrape HTTP/1.1 Authorization: Bearer @@ -34,7 +43,7 @@ content-type: application/json ### Check Job Status -GET http://localhost:3002/v0/crawl/status/333ab225-dc3e-418b-9d4b-8fb833cbaf89 HTTP/1.1 +GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1 Authorization: Bearer ### Get Job Result diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index fdc1c6136..1144c63de 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -38,6 +38,10 @@ export type WebScraperOptions = { concurrentRequests?: number; }; +export interface DocumentUrl { + url: string; +} + export class Document { id?: string; url?: string; // Used only in /search for now diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index d94342910..0e44310b5 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -1,9 +1,10 @@ import { Job } from "bull"; import { CrawlResult, WebScraperOptions } from "../types"; import { WebScraperDataProvider } from "../scraper/WebScraper"; -import { Progress } from "../lib/entities"; +import { DocumentUrl, Progress } from "../lib/entities"; import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../lib/entities"; + export async function startWebScraperPipeline({ job, }: { @@ -47,7 +48,7 @@ export async function runWebScraper({ }): Promise<{ success: boolean; message: string; - docs: CrawlResult[]; + docs: Document[] | DocumentUrl[]; }> { try { const provider = new WebScraperDataProvider(); @@ -68,7 +69,7 @@ export async function runWebScraper({ } const docs = (await provider.getDocuments(false, (progress: Progress) => { inProgress(progress); - })) as CrawlResult[]; + })) as Document[]; if (docs.length === 0) { return { @@ -79,7 +80,14 @@ export async function runWebScraper({ } // remove docs with empty content - const filteredDocs = docs.filter((doc) => doc.content.trim().length > 0); + const filteredDocs = crawlerOptions.returnOnlyUrls + ? docs.map((doc) => { + if (doc.metadata.sourceURL) { + return { url: doc.metadata.sourceURL }; + } + }) + : docs.filter((doc) => doc.content.trim().length > 0); + const { success, credit_usage } = await billTeam( team_id, diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index aee9fb8a4..1904ef995 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -80,11 +80,16 @@ export class WebScraperDataProvider { }); let links = await crawler.start(inProgress, 5, this.limit); if (this.returnOnlyUrls) { + inProgress({ + current: links.length, + total: links.length, + status: "COMPLETED", + currentDocumentUrl: this.urls[0], + }); return links.map((url) => ({ content: "", + markdown: "", metadata: { sourceURL: url }, - provider: "web", - type: "text", })); }