Skip to content

Commit 957eea4

Browse files
committed
Nick: extract without a schema should work as expected
1 parent 61e6af2 commit 957eea4

File tree

3 files changed

+152
-53
lines changed

3 files changed

+152
-53
lines changed

apps/api/src/lib/extract/extract-redis.ts

-2
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,7 @@ export enum ExtractStep {
99
MULTI_ENTITY_SCRAPE = "multi-entity-scrape",
1010
MULTI_ENTITY_EXTRACT = "multi-entity-extract",
1111
SCRAPE = "scrape",
12-
1312
EXTRACT = "extract",
14-
1513
COMPLETE = "complete",
1614
}
1715

apps/api/src/lib/extract/extraction-service.ts

+94-51
Original file line number | Diff line number | Diff line change
@@ -8,13 +8,16 @@ import { PlanType } from "../../types";
88
import { logger } from "../logger";
99
import { processUrl } from "./url-processor";
1010
import { scrapeDocument } from "./document-scraper";
11-
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
11+
import {
12+
generateOpenAICompletions,
13+
generateSchemaFromPrompt,
14+
} from "../../scraper/scrapeURL/transformers/llmExtract";
1215
import { buildDocument } from "./build-document";
1316
import { billTeam } from "../../services/billing/credit_billing";
1417
import { logJob } from "../../services/logging/log_job";
1518
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
1619
import { saveCrawl, StoredCrawl } from "../crawl-redis";
17-
import { dereferenceSchema } from "./helpers/dereference-schema";
20+
import { dereferenceSchema } from "./helpers/dereference-schema";
1821
import { z } from "zod";
1922
import OpenAI from "openai";
2023
import { spreadSchemas } from "./helpers/spread-schemas";
@@ -45,7 +48,6 @@ interface ExtractResult {
4548
error?: string;
4649
}
4750

48-
4951
async function analyzeSchemaAndPrompt(
5052
urls: string[],
5153
schema: any,
@@ -56,6 +58,10 @@ async function analyzeSchemaAndPrompt(
5658
reasoning?: string;
5759
keyIndicators?: string[];
5860
}> {
61+
if (!schema) {
62+
schema = await generateSchemaFromPrompt(prompt);
63+
}
64+
5965
const schemaString = JSON.stringify(schema);
6066

6167
const checkSchema = z.object({
@@ -132,7 +138,7 @@ type completions = {
132138
extract: Record<string, any>;
133139
numTokens: number;
134140
warning?: string;
135-
}
141+
};
136142

137143
function getRootDomain(url: string): string {
138144
try {
@@ -186,20 +192,22 @@ export async function performExtraction(
186192
includeSubdomains: request.includeSubdomains,
187193
schema: request.schema,
188194
},
189-
urlTraces,
195+
urlTraces,
190196
(links: string[]) => {
191197
aggMapLinks.push(...links);
192198
updateExtract(extractId, {
193199
steps: [
194-
{
195-
step: ExtractStep.MAP,
196-
startedAt: startMap,
197-
finishedAt: Date.now(),
198-
discoveredLinks: aggMapLinks,
199-
},
200-
],
201-
});
202-
}));
200+
{
201+
step: ExtractStep.MAP,
202+
startedAt: startMap,
203+
finishedAt: Date.now(),
204+
discoveredLinks: aggMapLinks,
205+
},
206+
],
207+
});
208+
},
209+
),
210+
);
203211

204212
const processedUrls = await Promise.all(urlPromises);
205213
const links = processedUrls.flat().filter((url) => url);
@@ -227,7 +235,13 @@ export async function performExtraction(
227235
});
228236

229237
let reqSchema = request.schema;
230-
reqSchema = await dereferenceSchema(reqSchema);
238+
if (!reqSchema && request.prompt) {
239+
reqSchema = await generateSchemaFromPrompt(request.prompt);
240+
}
241+
242+
if (reqSchema) {
243+
reqSchema = await dereferenceSchema(reqSchema);
244+
}
231245

232246
// agent evaluates if the schema or the prompt has an array with a large number of items
233247
// also it checks if the schema has any other properties that are not arrays
@@ -236,16 +250,19 @@ export async function performExtraction(
236250
// 2. the second one is multiple completions that will extract the items from the array
237251
let startAnalyze = Date.now();
238252
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
239-
await analyzeSchemaAndPrompt(links, request.schema, request.prompt ?? "");
253+
await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
240254

241255
// console.log("\nIs Multi Entity:", isMultiEntity);
242256
// console.log("\nMulti Entity Keys:", multiEntityKeys);
243257
// console.log("\nReasoning:", reasoning);
244258
// console.log("\nKey Indicators:", keyIndicators);
245259

246260
let rSchema = reqSchema;
247-
if (isMultiEntity) {
248-
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(reqSchema, multiEntityKeys)
261+
if (isMultiEntity && reqSchema) {
262+
const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas(
263+
reqSchema,
264+
multiEntityKeys,
265+
);
249266
rSchema = singleAnswerSchema;
250267

251268
await updateExtract(extractId, {
@@ -260,7 +277,6 @@ export async function performExtraction(
260277
],
261278
});
262279

263-
264280
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
265281

266282
await updateExtract(extractId, {
@@ -287,11 +303,11 @@ export async function performExtraction(
287303
timeout,
288304
},
289305
urlTraces,
290-
)
306+
);
291307
}
292308
return docsMap.get(url);
293-
})
294-
309+
});
310+
295311
let multyEntityDocs = (await Promise.all(scrapePromises)).filter(
296312
(doc): doc is Document => doc !== null,
297313
);
@@ -315,7 +331,7 @@ export async function performExtraction(
315331
docsMap.set(doc.metadata.url, doc);
316332
}
317333
}
318-
334+
319335
// Process docs in chunks with queue style processing
320336
const chunkSize = 50;
321337
const timeoutCompletion = 45000; // 45 second timeout
@@ -331,7 +347,7 @@ export async function performExtraction(
331347
const chunkPromises = chunk.map(async (doc) => {
332348
try {
333349
ajv.compile(multiEntitySchema);
334-
350+
335351
// Wrap in timeout promise
336352
const timeoutPromise = new Promise((resolve) => {
337353
setTimeout(() => resolve(null), timeoutCompletion);
@@ -342,25 +358,28 @@ export async function performExtraction(
342358
logger.child({ method: "extractService/checkShouldExtract" }),
343359
{
344360
mode: "llm",
345-
systemPrompt: "You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.",
361+
systemPrompt:
362+
"You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt.",
346363
prompt: `Should the following content be used to extract information for this prompt: "${request.prompt}" User schema is: ${JSON.stringify(multiEntitySchema)}\nReturn only true or false.`,
347364
schema: {
348-
"type": "object",
349-
"properties": {
350-
"extract": {
351-
"type": "boolean"
352-
}
365+
type: "object",
366+
properties: {
367+
extract: {
368+
type: "boolean",
369+
},
353370
},
354-
"required": ["extract"]
355-
}
371+
required: ["extract"],
372+
},
356373
},
357374
buildDocument(doc),
358375
undefined,
359-
true
376+
true,
360377
);
361378

362379
if (!shouldExtractCheck.extract["extract"]) {
363-
console.log(`Skipping extraction for ${doc.metadata.url} as content is irrelevant`);
380+
console.log(
381+
`Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
382+
);
364383
return null;
365384
}
366385
// Add confidence score to schema with 5 levels
@@ -369,11 +388,15 @@ export async function performExtraction(
369388
properties: {
370389
...multiEntitySchema.properties,
371390
is_content_relevant: {
372-
type: "boolean",
373-
description: "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information."
374-
}
391+
type: "boolean",
392+
description:
393+
"Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information.",
394+
},
375395
},
376-
required: [...(multiEntitySchema.required || []), "is_content_relevant"]
396+
required: [
397+
...(multiEntitySchema.required || []),
398+
"is_content_relevant",
399+
],
377400
};
378401
// console.log("schemaWithConfidence", schemaWithConfidence);
379402

@@ -384,15 +407,19 @@ export async function performExtraction(
384407
step: ExtractStep.MULTI_ENTITY_EXTRACT,
385408
startedAt: startScrape,
386409
finishedAt: Date.now(),
387-
discoveredLinks: [doc.metadata.url || doc.metadata.sourceURL || ""],
410+
discoveredLinks: [
411+
doc.metadata.url || doc.metadata.sourceURL || "",
412+
],
388413
},
389414
],
390415
});
391416

392417
const completionPromise = generateOpenAICompletions(
393-
logger.child({ method: "extractService/generateOpenAICompletions" }),
418+
logger.child({
419+
method: "extractService/generateOpenAICompletions",
420+
}),
394421
{
395-
mode: "llm",
422+
mode: "llm",
396423
systemPrompt:
397424
(request.systemPrompt ? `${request.systemPrompt}\n` : "") +
398425
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${JSON.stringify(multiEntitySchema)}, return null. Here are the urls the user provided of which he wants to extract information from: ` +
@@ -406,10 +433,10 @@ export async function performExtraction(
406433
);
407434

408435
// Race between timeout and completion
409-
const multiEntityCompletion = await Promise.race([
436+
const multiEntityCompletion = (await Promise.race([
410437
completionPromise,
411-
timeoutPromise
412-
]) as Awaited<ReturnType<typeof generateOpenAICompletions>>;
438+
timeoutPromise,
439+
])) as Awaited<ReturnType<typeof generateOpenAICompletions>>;
413440

414441
// console.log(multiEntityCompletion.extract)
415442
// if (!multiEntityCompletion.extract?.is_content_relevant) {
@@ -452,25 +479,36 @@ export async function performExtraction(
452479

453480
// Wait for current chunk to complete before processing next chunk
454481
const chunkResults = await Promise.all(chunkPromises);
455-
multiEntityCompletions.push(...chunkResults.filter(result => result !== null));
482+
multiEntityCompletions.push(
483+
...chunkResults.filter((result) => result !== null),
484+
);
456485
}
457486

458487
try {
459-
multiEntityResult = transformArrayToObject(multiEntitySchema, multiEntityCompletions);
488+
multiEntityResult = transformArrayToObject(
489+
multiEntitySchema,
490+
multiEntityCompletions,
491+
);
460492
multiEntityResult = deduplicateObjectsArray(multiEntityResult);
461493
multiEntityResult = mergeNullValObjs(multiEntityResult);
462494
// @nick: maybe we can add an LLM here that checks if the array probably has a primary key?
463495
} catch (error) {
464496
logger.error(`Failed to transform array to object: ${error}`);
465497
return {
466498
success: false,
467-
error: "An unexpected error occurred. Please contact help@firecrawl.com for help.",
499+
error:
500+
"An unexpected error occurred. Please contact help@firecrawl.com for help.",
468501
extractId,
469502
urlTrace: urlTraces,
470503
};
471504
}
472505
}
473-
if (rSchema && Object.keys(rSchema).length > 0 && rSchema.properties && Object.keys(rSchema.properties).length > 0) {
506+
if (
507+
rSchema &&
508+
Object.keys(rSchema).length > 0 &&
509+
rSchema.properties &&
510+
Object.keys(rSchema.properties).length > 0
511+
) {
474512
// Scrape documents
475513
const timeout = Math.floor((request.timeout || 40000) * 0.7) || 30000;
476514
let singleAnswerDocs: Document[] = [];
@@ -513,7 +551,9 @@ export async function performExtraction(
513551
}
514552
}
515553

516-
singleAnswerDocs.push(...results.filter((doc): doc is Document => doc !== null));
554+
singleAnswerDocs.push(
555+
...results.filter((doc): doc is Document => doc !== null),
556+
);
517557
} catch (error) {
518558
return {
519559
success: false,
@@ -527,7 +567,8 @@ export async function performExtraction(
527567
// All urls are invalid
528568
return {
529569
success: false,
530-
error: "All provided URLs are invalid. Please check your input and try again.",
570+
error:
571+
"All provided URLs are invalid. Please check your input and try again.",
531572
extractId,
532573
urlTrace: request.urlTrace ? urlTraces : undefined,
533574
};
@@ -584,7 +625,9 @@ export async function performExtraction(
584625
// }
585626
}
586627

587-
const finalResult = await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult);
628+
const finalResult = reqSchema
629+
? await mixSchemaObjects(reqSchema, singleAnswerResult, multiEntityResult)
630+
: singleAnswerResult || multiEntityResult;
588631

589632
let linksBilled = links.length * 5;
590633

0 commit comments

Comments
 (0)