@@ -8,13 +8,16 @@ import { PlanType } from "../../types";
8
8
import { logger } from "../logger" ;
9
9
import { processUrl } from "./url-processor" ;
10
10
import { scrapeDocument } from "./document-scraper" ;
11
- import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract" ;
11
+ import {
12
+ generateOpenAICompletions ,
13
+ generateSchemaFromPrompt ,
14
+ } from "../../scraper/scrapeURL/transformers/llmExtract" ;
12
15
import { buildDocument } from "./build-document" ;
13
16
import { billTeam } from "../../services/billing/credit_billing" ;
14
17
import { logJob } from "../../services/logging/log_job" ;
15
18
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs" ;
16
19
import { saveCrawl , StoredCrawl } from "../crawl-redis" ;
17
- import { dereferenceSchema } from "./helpers/dereference-schema" ;
20
+ import { dereferenceSchema } from "./helpers/dereference-schema" ;
18
21
import { z } from "zod" ;
19
22
import OpenAI from "openai" ;
20
23
import { spreadSchemas } from "./helpers/spread-schemas" ;
@@ -45,7 +48,6 @@ interface ExtractResult {
45
48
error ?: string ;
46
49
}
47
50
48
-
49
51
async function analyzeSchemaAndPrompt (
50
52
urls : string [ ] ,
51
53
schema : any ,
@@ -56,6 +58,10 @@ async function analyzeSchemaAndPrompt(
56
58
reasoning ?: string ;
57
59
keyIndicators ?: string [ ] ;
58
60
} > {
61
+ if ( ! schema ) {
62
+ schema = await generateSchemaFromPrompt ( prompt ) ;
63
+ }
64
+
59
65
const schemaString = JSON . stringify ( schema ) ;
60
66
61
67
const checkSchema = z . object ( {
@@ -132,7 +138,7 @@ type completions = {
132
138
extract : Record < string , any > ;
133
139
numTokens : number ;
134
140
warning ?: string ;
135
- }
141
+ } ;
136
142
137
143
function getRootDomain ( url : string ) : string {
138
144
try {
@@ -186,20 +192,22 @@ export async function performExtraction(
186
192
includeSubdomains : request . includeSubdomains ,
187
193
schema : request . schema ,
188
194
} ,
189
- urlTraces ,
195
+ urlTraces ,
190
196
( links : string [ ] ) => {
191
197
aggMapLinks . push ( ...links ) ;
192
198
updateExtract ( extractId , {
193
199
steps : [
194
- {
195
- step : ExtractStep . MAP ,
196
- startedAt : startMap ,
197
- finishedAt : Date . now ( ) ,
198
- discoveredLinks : aggMapLinks ,
199
- } ,
200
- ] ,
201
- } ) ;
202
- } ) ) ;
200
+ {
201
+ step : ExtractStep . MAP ,
202
+ startedAt : startMap ,
203
+ finishedAt : Date . now ( ) ,
204
+ discoveredLinks : aggMapLinks ,
205
+ } ,
206
+ ] ,
207
+ } ) ;
208
+ } ,
209
+ ) ,
210
+ ) ;
203
211
204
212
const processedUrls = await Promise . all ( urlPromises ) ;
205
213
const links = processedUrls . flat ( ) . filter ( ( url ) => url ) ;
@@ -227,7 +235,13 @@ export async function performExtraction(
227
235
} ) ;
228
236
229
237
let reqSchema = request . schema ;
230
- reqSchema = await dereferenceSchema ( reqSchema ) ;
238
+ if ( ! reqSchema && request . prompt ) {
239
+ reqSchema = await generateSchemaFromPrompt ( request . prompt ) ;
240
+ }
241
+
242
+ if ( reqSchema ) {
243
+ reqSchema = await dereferenceSchema ( reqSchema ) ;
244
+ }
231
245
232
246
// agent evaluates if the schema or the prompt has an array with big amount of items
233
247
// also it checks if the schema any other properties that are not arrays
@@ -236,16 +250,19 @@ export async function performExtraction(
236
250
// 2. the second one is multiple completions that will extract the items from the array
237
251
let startAnalyze = Date . now ( ) ;
238
252
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
239
- await analyzeSchemaAndPrompt ( links , request . schema , request . prompt ?? "" ) ;
253
+ await analyzeSchemaAndPrompt ( links , reqSchema , request . prompt ?? "" ) ;
240
254
241
255
// console.log("\nIs Multi Entity:", isMultiEntity);
242
256
// console.log("\nMulti Entity Keys:", multiEntityKeys);
243
257
// console.log("\nReasoning:", reasoning);
244
258
// console.log("\nKey Indicators:", keyIndicators);
245
259
246
260
let rSchema = reqSchema ;
247
- if ( isMultiEntity ) {
248
- const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas ( reqSchema , multiEntityKeys )
261
+ if ( isMultiEntity && reqSchema ) {
262
+ const { singleAnswerSchema, multiEntitySchema } = await spreadSchemas (
263
+ reqSchema ,
264
+ multiEntityKeys ,
265
+ ) ;
249
266
rSchema = singleAnswerSchema ;
250
267
251
268
await updateExtract ( extractId , {
@@ -260,7 +277,6 @@ export async function performExtraction(
260
277
] ,
261
278
} ) ;
262
279
263
-
264
280
const timeout = Math . floor ( ( request . timeout || 40000 ) * 0.7 ) || 30000 ;
265
281
266
282
await updateExtract ( extractId , {
@@ -287,11 +303,11 @@ export async function performExtraction(
287
303
timeout,
288
304
} ,
289
305
urlTraces ,
290
- )
306
+ ) ;
291
307
}
292
308
return docsMap . get ( url ) ;
293
- } )
294
-
309
+ } ) ;
310
+
295
311
let multyEntityDocs = ( await Promise . all ( scrapePromises ) ) . filter (
296
312
( doc ) : doc is Document => doc !== null ,
297
313
) ;
@@ -315,7 +331,7 @@ export async function performExtraction(
315
331
docsMap . set ( doc . metadata . url , doc ) ;
316
332
}
317
333
}
318
-
334
+
319
335
// Process docs in chunks with queue style processing
320
336
const chunkSize = 50 ;
321
337
const timeoutCompletion = 45000 ; // 45 second timeout
@@ -331,7 +347,7 @@ export async function performExtraction(
331
347
const chunkPromises = chunk . map ( async ( doc ) => {
332
348
try {
333
349
ajv . compile ( multiEntitySchema ) ;
334
-
350
+
335
351
// Wrap in timeout promise
336
352
const timeoutPromise = new Promise ( ( resolve ) => {
337
353
setTimeout ( ( ) => resolve ( null ) , timeoutCompletion ) ;
@@ -342,25 +358,28 @@ export async function performExtraction(
342
358
logger . child ( { method : "extractService/checkShouldExtract" } ) ,
343
359
{
344
360
mode : "llm" ,
345
- systemPrompt : "You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt." ,
361
+ systemPrompt :
362
+ "You are a content relevance checker. Your job is to determine if the provided content is very relevant to extract information from based on the user's prompt. Return true only if the content appears relevant and contains information that could help answer the prompt. Return false if the content seems irrelevant or unlikely to contain useful information for the prompt." ,
346
363
prompt : `Should the following content be used to extract information for this prompt: "${ request . prompt } " User schema is: ${ JSON . stringify ( multiEntitySchema ) } \nReturn only true or false.` ,
347
364
schema : {
348
- " type" : "object" ,
349
- " properties" : {
350
- " extract" : {
351
- " type" : "boolean"
352
- }
365
+ type : "object" ,
366
+ properties : {
367
+ extract : {
368
+ type : "boolean" ,
369
+ } ,
353
370
} ,
354
- " required" : [ "extract" ]
355
- }
371
+ required : [ "extract" ] ,
372
+ } ,
356
373
} ,
357
374
buildDocument ( doc ) ,
358
375
undefined ,
359
- true
376
+ true ,
360
377
) ;
361
378
362
379
if ( ! shouldExtractCheck . extract [ "extract" ] ) {
363
- console . log ( `Skipping extraction for ${ doc . metadata . url } as content is irrelevant` ) ;
380
+ console . log (
381
+ `Skipping extraction for ${ doc . metadata . url } as content is irrelevant` ,
382
+ ) ;
364
383
return null ;
365
384
}
366
385
// Add confidence score to schema with 5 levels
@@ -369,11 +388,15 @@ export async function performExtraction(
369
388
properties : {
370
389
...multiEntitySchema . properties ,
371
390
is_content_relevant : {
372
- type : "boolean" ,
373
- description : "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information."
374
- }
391
+ type : "boolean" ,
392
+ description :
393
+ "Determine if this content is relevant to the prompt. Return true ONLY if the content contains information that directly helps answer the prompt. Return false if the content is irrelevant or unlikely to contain useful information." ,
394
+ } ,
375
395
} ,
376
- required : [ ...( multiEntitySchema . required || [ ] ) , "is_content_relevant" ]
396
+ required : [
397
+ ...( multiEntitySchema . required || [ ] ) ,
398
+ "is_content_relevant" ,
399
+ ] ,
377
400
} ;
378
401
// console.log("schemaWithConfidence", schemaWithConfidence);
379
402
@@ -384,15 +407,19 @@ export async function performExtraction(
384
407
step : ExtractStep . MULTI_ENTITY_EXTRACT ,
385
408
startedAt : startScrape ,
386
409
finishedAt : Date . now ( ) ,
387
- discoveredLinks : [ doc . metadata . url || doc . metadata . sourceURL || "" ] ,
410
+ discoveredLinks : [
411
+ doc . metadata . url || doc . metadata . sourceURL || "" ,
412
+ ] ,
388
413
} ,
389
414
] ,
390
415
} ) ;
391
416
392
417
const completionPromise = generateOpenAICompletions (
393
- logger . child ( { method : "extractService/generateOpenAICompletions" } ) ,
418
+ logger . child ( {
419
+ method : "extractService/generateOpenAICompletions" ,
420
+ } ) ,
394
421
{
395
- mode : "llm" ,
422
+ mode : "llm" ,
396
423
systemPrompt :
397
424
( request . systemPrompt ? `${ request . systemPrompt } \n` : "" ) +
398
425
`Always prioritize using the provided content to answer the question. Do not make up an answer. Do not hallucinate. Be concise and follow the schema always if provided. If the document provided is not relevant to the prompt nor to the final user schema ${ JSON . stringify ( multiEntitySchema ) } , return null. Here are the urls the user provided of which he wants to extract information from: ` +
@@ -406,10 +433,10 @@ export async function performExtraction(
406
433
) ;
407
434
408
435
// Race between timeout and completion
409
- const multiEntityCompletion = await Promise . race ( [
436
+ const multiEntityCompletion = ( await Promise . race ( [
410
437
completionPromise ,
411
- timeoutPromise
412
- ] ) as Awaited < ReturnType < typeof generateOpenAICompletions > > ;
438
+ timeoutPromise ,
439
+ ] ) ) as Awaited < ReturnType < typeof generateOpenAICompletions > > ;
413
440
414
441
// console.log(multiEntityCompletion.extract)
415
442
// if (!multiEntityCompletion.extract?.is_content_relevant) {
@@ -452,25 +479,36 @@ export async function performExtraction(
452
479
453
480
// Wait for current chunk to complete before processing next chunk
454
481
const chunkResults = await Promise . all ( chunkPromises ) ;
455
- multiEntityCompletions . push ( ...chunkResults . filter ( result => result !== null ) ) ;
482
+ multiEntityCompletions . push (
483
+ ...chunkResults . filter ( ( result ) => result !== null ) ,
484
+ ) ;
456
485
}
457
486
458
487
try {
459
- multiEntityResult = transformArrayToObject ( multiEntitySchema , multiEntityCompletions ) ;
488
+ multiEntityResult = transformArrayToObject (
489
+ multiEntitySchema ,
490
+ multiEntityCompletions ,
491
+ ) ;
460
492
multiEntityResult = deduplicateObjectsArray ( multiEntityResult ) ;
461
493
multiEntityResult = mergeNullValObjs ( multiEntityResult ) ;
462
494
// @nick : maybe we can add here a llm that checks if the array probably has a primary key?
463
495
} catch ( error ) {
464
496
logger . error ( `Failed to transform array to object: ${ error } ` ) ;
465
497
return {
466
498
success : false ,
467
- error :
"An unexpected error occurred. Please contact [email protected] for help." ,
499
+ error :
500
+ "An unexpected error occurred. Please contact [email protected] for help." ,
468
501
extractId,
469
502
urlTrace : urlTraces ,
470
503
} ;
471
504
}
472
505
}
473
- if ( rSchema && Object . keys ( rSchema ) . length > 0 && rSchema . properties && Object . keys ( rSchema . properties ) . length > 0 ) {
506
+ if (
507
+ rSchema &&
508
+ Object . keys ( rSchema ) . length > 0 &&
509
+ rSchema . properties &&
510
+ Object . keys ( rSchema . properties ) . length > 0
511
+ ) {
474
512
// Scrape documents
475
513
const timeout = Math . floor ( ( request . timeout || 40000 ) * 0.7 ) || 30000 ;
476
514
let singleAnswerDocs : Document [ ] = [ ] ;
@@ -513,7 +551,9 @@ export async function performExtraction(
513
551
}
514
552
}
515
553
516
- singleAnswerDocs . push ( ...results . filter ( ( doc ) : doc is Document => doc !== null ) ) ;
554
+ singleAnswerDocs . push (
555
+ ...results . filter ( ( doc ) : doc is Document => doc !== null ) ,
556
+ ) ;
517
557
} catch ( error ) {
518
558
return {
519
559
success : false ,
@@ -527,7 +567,8 @@ export async function performExtraction(
527
567
// All urls are invalid
528
568
return {
529
569
success : false ,
530
- error : "All provided URLs are invalid. Please check your input and try again." ,
570
+ error :
571
+ "All provided URLs are invalid. Please check your input and try again." ,
531
572
extractId,
532
573
urlTrace : request . urlTrace ? urlTraces : undefined ,
533
574
} ;
@@ -584,7 +625,9 @@ export async function performExtraction(
584
625
// }
585
626
}
586
627
587
- const finalResult = await mixSchemaObjects ( reqSchema , singleAnswerResult , multiEntityResult ) ;
628
+ const finalResult = reqSchema
629
+ ? await mixSchemaObjects ( reqSchema , singleAnswerResult , multiEntityResult )
630
+ : singleAnswerResult || multiEntityResult ;
588
631
589
632
let linksBilled = links . length * 5 ;
590
633
0 commit comments