Skip to content

Commit

Permalink
push commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Aure7138 committed Jul 20, 2024
1 parent b62a002 commit af64584
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 1 deletion.
1 change: 1 addition & 0 deletions config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ export const defaultConfig: Config = {
maxPagesToCrawl: 50,
outputFileName: "output.json",
maxTokens: 2000000,
// proxyUrls: ["http://username:password@proxyserver:port"], // SOCKS5 alternative: "socks5://username:password@proxyserver:port"
};
4 changes: 4 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ export const configSchema = z.object({
* @example 5000
*/
maxTokens: z.number().int().positive().optional(),
/** Optional list of proxy server URLs passed to the crawler's proxy configuration
* @example ['http://username:password@proxyserver:port', 'socks5://username:password@proxyserver:port']
*/
proxyUrls: z.array(z.string()).optional(),
});

export type Config = z.infer<typeof configSchema>;
7 changes: 6 additions & 1 deletion src/core.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// For more information, see https://crawlee.dev/
import { Configuration, PlaywrightCrawler, downloadListOfUrls } from "crawlee";
import { Configuration, PlaywrightCrawler, ProxyConfiguration, downloadListOfUrls } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { Config, configSchema } from "./config.js";
Expand Down Expand Up @@ -54,8 +54,13 @@ export async function crawl(config: Config) {
if (process.env.NO_CRAWL !== "true") {
// PlaywrightCrawler crawls the web using a headless
// browser controlled by the Playwright library.
const proxyConfiguration = new ProxyConfiguration({
proxyUrls: config.proxyUrls,
});

crawler = new PlaywrightCrawler(
{
proxyConfiguration,
// Use the requestHandler to process each of the crawled pages.
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
const title = await page.title();
Expand Down

0 comments on commit af64584

Please sign in to comment.