add HAR example
snshn committed May 20, 2024
1 parent 7b91448 commit e9c51ff
Showing 1 changed file with 59 additions and 0 deletions.
59 changes: 59 additions & 0 deletions README.md
@@ -15,6 +15,7 @@ for both manual and automatically created web extractors
- [Example Scraper](#example-scraper)
- [Detail Only Scraper](#detail-only-scraper)
- [Listing Scraper](#listing-scraper)
- [Using Cache](#using-cache)
- [Running a Scraper](#running-a-scraper)
- [Submitting a PR](#submitting-a-pr)
---
@@ -147,6 +148,64 @@ if __name__ == "__main__":
```


#### Using Cache
The code below is an example detail scraper that relies on a HAR cache
created during its initial run; subsequent runs use the cache as their
data source, improving speed and reducing bandwidth usage.

```python
import asyncio
import os.path
from typing import Any

from playwright.async_api import Page

from harambe import SDK
from harambe import PlaywrightUtils as Pu

HAR_FILE_PATH = "bananas.har"
SELECTORS = {
    "last_page": "",
    "list_view": "//div[@class='et_pb_blurb_content']",
    "name": "//h4/*[self::span or self::a]",
    "fax": ">Fax.*?strong>(.*?)<br>",
    # etc...
}


async def setup(sdk: SDK) -> None:
    page: Page = sdk.page

    already_cached = os.path.isfile(HAR_FILE_PATH)

    if already_cached:
        # Replay matching requests from the existing HAR file;
        # anything not found in the archive falls back to the network
        await page.route_from_har(HAR_FILE_PATH, not_found="fallback")
    else:
        # First run: record live network traffic into the HAR file for reuse
        await page.route_from_har(HAR_FILE_PATH, not_found="fallback", update=True)


# Annotation registers the scraper with the SDK
@SDK.scraper(domain="https://apprhs.org/our-locations/", stage="detail")
async def scrape(sdk: SDK, url: str, *args: Any, **kwargs: Any) -> None:
    page: Page = sdk.page

    locations = await page.locator(SELECTORS["list_view"]).all()
    for location in locations:
        # Save the data to the database or file
        await sdk.save_data(
            {
                "name": await Pu.get_text(location, SELECTORS["name"]),
                "fax": await Pu.parse_by_regex(location, SELECTORS["fax"]),
                # etc...
            }
        )


if __name__ == "__main__":
    asyncio.run(SDK.run(scrape, "https://apprhs.org/our-locations/", setup=setup))
```
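The two `route_from_har` calls above differ only in the `update` flag, so the branch can be collapsed into a single call. A minimal sketch of that variant, using the same names and SDK as above and Playwright's standard `update` keyword argument:

```python
async def setup(sdk: SDK) -> None:
    page: Page = sdk.page

    # Record the HAR on the first run (file missing), replay it afterwards:
    # update=True captures live traffic into the file, while update=False
    # serves matching requests from it, falling back to the network for
    # anything not found in the archive.
    await page.route_from_har(
        HAR_FILE_PATH,
        not_found="fallback",
        update=not os.path.isfile(HAR_FILE_PATH),
    )
```

Deleting `bananas.har` forces the scraper to rebuild the cache from live traffic on its next run.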


## Running a Scraper
You can use Poetry to run a scraper. The `run` command takes the
scraper function and the URL to scrape. The `run_from_file` command
