Merge pull request FlowiseAI#1566 from 0xi4o/feature/scrapped-links
FEATURE: Select which links should be used in web scraper nodes (cheerio, puppeteer, and playwright)
0xi4o authored Jan 25, 2024
2 parents eb4d545 + 94d8e00 commit 09d2b96
Showing 7 changed files with 328 additions and 34 deletions.
32 changes: 22 additions & 10 deletions packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
@@ -1,4 +1,4 @@
-import { INode, INodeData, INodeParams } from '../../../src/Interface'
+import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
 import { TextSplitter } from 'langchain/text_splitter'
 import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio'
 import { test } from 'linkifyjs'
@@ -63,6 +63,7 @@ class Cheerio_DocumentLoaders implements INode {
                 name: 'limit',
                 type: 'number',
                 optional: true,
+                default: '10',
                 additionalParams: true,
                 description:
                     'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
@@ -86,11 +87,12 @@ class Cheerio_DocumentLoaders implements INode {
         ]
     }

-    async init(nodeData: INodeData): Promise<any> {
+    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
         const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
         const metadata = nodeData.inputs?.metadata
         const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
-        let limit = nodeData.inputs?.limit as string
+        const selectedLinks = nodeData.inputs?.selectedLinks as string[]
+        let limit = parseInt(nodeData.inputs?.limit as string)

         let url = nodeData.inputs?.url as string
         url = url.trim()
@@ -117,23 +119,33 @@ class Cheerio_DocumentLoaders implements INode {
                 }
                 return docs
             } catch (err) {
-                if (process.env.DEBUG === 'true') console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
+                if (process.env.DEBUG === 'true') options.logger.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
             }
         }

         let docs = []
         if (relativeLinksMethod) {
-            if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
-            if (!limit) limit = '10'
-            else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
+            if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
+            if (!limit) limit = 10
+            else if (limit < 0) throw new Error('Limit cannot be less than 0')
             const pages: string[] =
-                relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
-            if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
+                selectedLinks && selectedLinks.length > 0
+                    ? selectedLinks.slice(0, limit)
+                    : relativeLinksMethod === 'webCrawl'
+                    ? await webCrawl(url, limit)
+                    : await xmlScrape(url, limit)
+            if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
             if (!pages || pages.length === 0) throw new Error('No relative links found')
             for (const page of pages) {
                 docs.push(...(await cheerioLoader(page)))
             }
-            if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
+            if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`)
+        } else if (selectedLinks && selectedLinks.length > 0) {
+            if (process.env.DEBUG === 'true')
+                options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
+            for (const page of selectedLinks) {
+                docs.push(...(await cheerioLoader(page)))
+            }
         } else {
             docs = await cheerioLoader(url)
         }
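
The heart of the change in all three loaders is the page-list resolution order: links hand-picked in the UI take priority (truncated to the limit), and the crawler or sitemap scraper only runs as a fallback. A minimal sketch of that decision as a standalone TypeScript helper — the helper name, signature, and injected fetchers are illustrative, not part of this diff:

type LinkFetcher = (url: string, limit: number) => Promise<string[]>

// Hypothetical helper mirroring the ternary above; crawl/scrape stand in for
// the webCrawl/xmlScrape exports of flowise-components.
async function resolvePages(
    url: string,
    selectedLinks: string[] | undefined,
    limit: number,
    relativeLinksMethod: string,
    crawl: LinkFetcher,
    scrape: LinkFetcher
): Promise<string[]> {
    // Links the user ticked in the UI win over any crawl, capped at the limit
    if (selectedLinks && selectedLinks.length > 0) return selectedLinks.slice(0, limit)
    // Otherwise discover links by crawling the page or scraping its XML sitemap
    return relativeLinksMethod === 'webCrawl' ? crawl(url, limit) : scrape(url, limit)
}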
32 changes: 22 additions & 10 deletions packages/components/nodes/documentloaders/Playwright/Playwright.ts
@@ -1,4 +1,4 @@
-import { INode, INodeData, INodeParams } from '../../../src/Interface'
+import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
 import { TextSplitter } from 'langchain/text_splitter'
 import { Browser, Page, PlaywrightWebBaseLoader, PlaywrightWebBaseLoaderOptions } from 'langchain/document_loaders/web/playwright'
 import { test } from 'linkifyjs'
@@ -61,6 +61,7 @@ class Playwright_DocumentLoaders implements INode {
                 name: 'limit',
                 type: 'number',
                 optional: true,
+                default: '10',
                 additionalParams: true,
                 description:
                     'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
@@ -114,11 +115,12 @@ class Playwright_DocumentLoaders implements INode {
         ]
     }

-    async init(nodeData: INodeData): Promise<any> {
+    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
         const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
         const metadata = nodeData.inputs?.metadata
         const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
-        let limit = nodeData.inputs?.limit as string
+        const selectedLinks = nodeData.inputs?.selectedLinks as string[]
+        let limit = parseInt(nodeData.inputs?.limit as string)
         let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined
         let waitForSelector = nodeData.inputs?.waitForSelector as string

@@ -158,23 +160,33 @@ class Playwright_DocumentLoaders implements INode {
                 }
                 return docs
             } catch (err) {
-                if (process.env.DEBUG === 'true') console.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`)
+                if (process.env.DEBUG === 'true') options.logger.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`)
             }
         }

         let docs = []
         if (relativeLinksMethod) {
-            if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
-            if (!limit) limit = '10'
-            else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
+            if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
+            if (!limit) limit = 10
+            else if (limit < 0) throw new Error('Limit cannot be less than 0')
             const pages: string[] =
-                relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
-            if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
+                selectedLinks && selectedLinks.length > 0
+                    ? selectedLinks.slice(0, limit)
+                    : relativeLinksMethod === 'webCrawl'
+                    ? await webCrawl(url, limit)
+                    : await xmlScrape(url, limit)
+            if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
             if (!pages || pages.length === 0) throw new Error('No relative links found')
             for (const page of pages) {
                 docs.push(...(await playwrightLoader(page)))
             }
-            if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
+            if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`)
+        } else if (selectedLinks && selectedLinks.length > 0) {
+            if (process.env.DEBUG === 'true')
+                options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
+            for (const page of selectedLinks) {
+                docs.push(...(await playwrightLoader(page)))
+            }
         } else {
             docs = await playwrightLoader(url)
         }
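
The same hunk also changes how limit is handled: it is now parsed once with parseInt instead of being re-parsed at every use. A sketch of the resulting normalization, with one behavioural detail worth noting — parseInt of a missing input yields NaN, and since NaN and 0 are both falsy, either now falls back to the default of 10 (the function name below is illustrative):

// Sketch only; mirrors the `if (!limit) limit = 10` logic in the diff above.
function normalizeLimit(raw: string | undefined): number {
    const limit = parseInt(raw as string) // NaN when the input is missing or empty
    if (!limit) return 10 // NaN and 0 are both falsy, so both fall back to 10
    if (limit < 0) throw new Error('Limit cannot be less than 0')
    return limit
}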
32 changes: 22 additions & 10 deletions packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts
@@ -1,4 +1,4 @@
-import { INode, INodeData, INodeParams } from '../../../src/Interface'
+import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
 import { TextSplitter } from 'langchain/text_splitter'
 import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from 'langchain/document_loaders/web/puppeteer'
 import { test } from 'linkifyjs'
@@ -62,6 +62,7 @@ class Puppeteer_DocumentLoaders implements INode {
                 name: 'limit',
                 type: 'number',
                 optional: true,
+                default: '10',
                 additionalParams: true,
                 description:
                     'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
@@ -115,11 +116,12 @@ class Puppeteer_DocumentLoaders implements INode {
         ]
     }

-    async init(nodeData: INodeData): Promise<any> {
+    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
         const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
         const metadata = nodeData.inputs?.metadata
         const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
-        let limit = nodeData.inputs?.limit as string
+        const selectedLinks = nodeData.inputs?.selectedLinks as string[]
+        let limit = parseInt(nodeData.inputs?.limit as string)
         let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
         let waitForSelector = nodeData.inputs?.waitForSelector as string

@@ -159,23 +161,33 @@ class Puppeteer_DocumentLoaders implements INode {
                 }
                 return docs
             } catch (err) {
-                if (process.env.DEBUG === 'true') console.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`)
+                if (process.env.DEBUG === 'true') options.logger.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`)
             }
         }

         let docs = []
         if (relativeLinksMethod) {
-            if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
-            if (!limit) limit = '10'
-            else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
+            if (process.env.DEBUG === 'true') options.logger.info(`Start ${relativeLinksMethod}`)
+            if (!limit) limit = 10
+            else if (limit < 0) throw new Error('Limit cannot be less than 0')
             const pages: string[] =
-                relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
-            if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
+                selectedLinks && selectedLinks.length > 0
+                    ? selectedLinks.slice(0, limit)
+                    : relativeLinksMethod === 'webCrawl'
+                    ? await webCrawl(url, limit)
+                    : await xmlScrape(url, limit)
+            if (process.env.DEBUG === 'true') options.logger.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
             if (!pages || pages.length === 0) throw new Error('No relative links found')
             for (const page of pages) {
                 docs.push(...(await puppeteerLoader(page)))
             }
-            if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
+            if (process.env.DEBUG === 'true') options.logger.info(`Finish ${relativeLinksMethod}`)
+        } else if (selectedLinks && selectedLinks.length > 0) {
+            if (process.env.DEBUG === 'true')
+                options.logger.info(`pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`)
+            for (const page of selectedLinks) {
+                docs.push(...(await puppeteerLoader(page)))
+            }
         } else {
             docs = await puppeteerLoader(url)
         }
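
When no Get Relative Links Method is set but the user has picked links, the new else-if branch loads exactly those pages. The sequential loop is identical in all three loaders; a generic sketch (the function name and type parameter are illustrative):

// Hypothetical generic form of the new branch; loader is any per-page loader
// such as cheerioLoader, playwrightLoader, or puppeteerLoader above.
async function loadSelectedPages<TDoc>(selectedLinks: string[], loader: (page: string) => Promise<TDoc[]>): Promise<TDoc[]> {
    const docs: TDoc[] = []
    for (const page of selectedLinks) {
        docs.push(...(await loader(page))) // pages are loaded one at a time, in order
    }
    return docs
}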
15 changes: 14 additions & 1 deletion packages/server/src/index.ts
@@ -59,7 +59,7 @@ import { Tool } from './database/entities/Tool'
 import { Assistant } from './database/entities/Assistant'
 import { ChatflowPool } from './ChatflowPool'
 import { CachePool } from './CachePool'
-import { ICommonObject, IMessage, INodeOptionsValue, handleEscapeCharacters } from 'flowise-components'
+import { ICommonObject, IMessage, INodeOptionsValue, handleEscapeCharacters, webCrawl, xmlScrape } from 'flowise-components'
 import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit'
 import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey'
 import { sanitizeMiddleware } from './utils/XSS'
@@ -1117,6 +1117,19 @@ export class App {
             }
         })

+        // ----------------------------------------
+        // Scraper
+        // ----------------------------------------
+
+        this.app.get('/api/v1/fetch-links', async (req: Request, res: Response) => {
+            const url = decodeURIComponent(req.query.url as string)
+            const relativeLinksMethod = req.query.relativeLinksMethod as string
+            if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
+            const links: string[] = relativeLinksMethod === 'webCrawl' ? await webCrawl(url, 0) : await xmlScrape(url, 0)
+
+            res.json({ status: 'OK', links })
+        })
+
         // ----------------------------------------
         // Upsert
         // ----------------------------------------
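
The new endpoint passes a limit of 0 to webCrawl/xmlScrape, meaning it returns the complete link list so the UI can offer every discovered link for selection. A hypothetical client call, assuming a local Flowise instance on its default port:

// Sketch only; host, port, and target URL are assumptions.
const target = encodeURIComponent('https://example.com')
const resp = await fetch(`http://localhost:3000/api/v1/fetch-links?url=${target}&relativeLinksMethod=webCrawl`)
const { status, links } = (await resp.json()) as { status: string; links: string[] }
console.log(status, links.length)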
8 changes: 8 additions & 0 deletions packages/ui/src/api/scraper.js
@@ -0,0 +1,8 @@
+import client from './client'
+
+const fetchAllLinks = (url, relativeLinksMethod) =>
+    client.get(`/fetch-links?url=${encodeURIComponent(url)}&relativeLinksMethod=${relativeLinksMethod}`)
+
+export default {
+    fetchAllLinks
+}
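
A minimal usage sketch from UI code, assuming client is an axios-style instance as the get call above suggests (the import path and surrounding code are illustrative):

import scraperApi from './api/scraper' // path relative to an assumed caller in the ui package
const response = await scraperApi.fetchAllLinks('https://example.com', 'webCrawl')
console.log(response.data.links) // axios-style response body: { status: 'OK', links: [...] }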