From 74bee0500e1f34358797be004b86832f4bf93d1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20K=C5=99ivka?= Date: Wed, 6 Mar 2024 08:33:05 +0100 Subject: [PATCH] fix(academy-challenge): mention Playwright and residential proxies (#876) --- .../web_scraping_for_beginners/challenge/index.md | 2 ++ .../challenge/initializing_and_setting_up.md | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/sources/academy/webscraping/web_scraping_for_beginners/challenge/index.md b/sources/academy/webscraping/web_scraping_for_beginners/challenge/index.md index 2d842086a..65b77a450 100644 --- a/sources/academy/webscraping/web_scraping_for_beginners/challenge/index.md +++ b/sources/academy/webscraping/web_scraping_for_beginners/challenge/index.md @@ -83,4 +83,6 @@ Each of the items in the dataset will represent a scraped offer, and will have t From this course, you should have all the knowledge to build this scraper by yourself. Give it a try, then come back to compare your scraper with our solution. +The challenge can be completed using either [CheerioCrawler](https://crawlee.dev/api/cheerio-crawler/class/CheerioCrawler) or [PlaywrightCrawler](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler). Playwright is significantly slower but doesn't get blocked as much. You will learn the most by implementing both. + Let's start off this section easy by [initializing and setting up](./initializing_and_setting_up.md) our project with the Crawlee CLI (don't worry, no additional install is required). 
diff --git a/sources/academy/webscraping/web_scraping_for_beginners/challenge/initializing_and_setting_up.md b/sources/academy/webscraping/web_scraping_for_beginners/challenge/initializing_and_setting_up.md index f2b8a046b..ffd1d26cd 100644 --- a/sources/academy/webscraping/web_scraping_for_beginners/challenge/initializing_and_setting_up.md +++ b/sources/academy/webscraping/web_scraping_for_beginners/challenge/initializing_and_setting_up.md @@ -33,6 +33,12 @@ const { keyword } = await KeyValueStore.getInput(); const crawler = new CheerioCrawler({ requestHandler: router, + + // If you have access to Apify Proxy, you can use residential proxies and + // a high retry count, which helps with blocking. + // If you don't, your local IP address will likely be fine for a few requests if you scrape slowly. + // proxyConfiguration: await Actor.createProxyConfiguration({ groups: ['RESIDENTIAL'] }), + // maxRequestRetries: 10, }); log.info('Starting the crawl.');