diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ab0556d..ac73b4e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ # [1.5.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.4.0...v1.5.0) (2024-07-05) + ### Features -- git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c)) +* git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c)) # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15) diff --git a/config.ts b/config.ts index 6a24846b..f5c958df 100644 --- a/config.ts +++ b/config.ts @@ -6,5 +6,4 @@ export const defaultConfig: Config = { maxPagesToCrawl: 50, outputFileName: "output.json", maxTokens: 2000000, - // proxyUrls: ["http://username:password@proxyserver:port"], // socks5://username:password@proxyserver:port }; diff --git a/src/config.ts b/src/config.ts index 0e4f0159..787744ce 100644 --- a/src/config.ts +++ b/src/config.ts @@ -85,10 +85,6 @@ export const configSchema = z.object({ * @example 5000 */ maxTokens: z.number().int().positive().optional(), - /** Optional proxy server - * @example ['http://username:password@proxyserver:port', 'socks5://username:password@proxyserver:port'] - */ - proxyUrls: z.array(z.string()).optional(), }); export type Config = z.infer<typeof configSchema>; diff --git a/src/core.ts b/src/core.ts index 2e19c4e0..c996f2bb 100644 --- a/src/core.ts +++ b/src/core.ts @@ -1,10 +1,5 @@ // For more information, see https://crawlee.dev/ -import { - Configuration, - PlaywrightCrawler, - ProxyConfiguration, - downloadListOfUrls, -} from "crawlee"; +import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; import { Config, configSchema } from "./config.js"; @@ -59,13 +54,8 @@ export async function crawl(config: Config) { if (process.env.NO_CRAWL !== "true") { // 
PlaywrightCrawler crawls the web using a headless // browser controlled by the Playwright library. - const proxyConfiguration = new ProxyConfiguration({ - proxyUrls: config.proxyUrls, - }); - crawler = new PlaywrightCrawler( { - proxyConfiguration, // Use the requestHandler to process each of the crawled pages. async requestHandler({ request, page, enqueueLinks, log, pushData }) { const title = await page.title();