Skip to content

Commit

Permalink
feat: allow scrape of html (#540)
Browse files Browse the repository at this point in the history
* implement html scraper option

* use html option for tests

* potential wild-mode fix

* pass string instead of TextIOWrapper

* document scrape_html
  • Loading branch information
hay-kot authored May 23, 2022
1 parent 9ab2d1d commit 0073d3c
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 7 deletions.
40 changes: 39 additions & 1 deletion recipe_scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import contextlib
from typing import Optional

from ._abstract import AbstractScraper
from ._exceptions import NoSchemaFoundInWildMode, WebsiteNotImplementedError
from ._factory import SchemaScraperFactory
from ._utils import get_host_name
Expand Down Expand Up @@ -417,5 +421,39 @@ def scrape_me(url_path, **options):
return scraper(url_path, **options)


__all__ = ["scrape_me"]
def scrape_html(html: str, org_url: Optional[str] = None, **options) -> AbstractScraper:
    """Build a scraper from a string of raw HTML.

    If *org_url* is given, its host name is used to look up a registered
    site-specific scraper. When no URL is supplied, or no scraper is
    registered for the host, the function falls back to wild mode, which
    relies on schema.org metadata embedded in the HTML.

    Args:
        html (str): raw HTML in text form.
        org_url (Optional[str], optional): original URL of the HTML.
            Defaults to None.
        **options: extra keyword arguments forwarded to the scraper.

    Raises:
        NoSchemaFoundInWildMode: if no site-specific scraper matches and
            no schema is found in wild mode.

    Returns:
        AbstractScraper: a scraper instance bound to the given HTML.
    """
    # Must be initialized before the conditional lookup: without this, the
    # `if not scraper` check below raises UnboundLocalError whenever
    # org_url is None or the host has no registered scraper (the KeyError
    # is suppressed, leaving `scraper` unassigned).
    scraper = None
    host_name = get_host_name(org_url) if org_url else None

    if host_name:
        with contextlib.suppress(KeyError):
            scraper = SCRAPERS[host_name]

    if not scraper:
        wild_scraper = SchemaScraperFactory.generate(url=org_url, html=html, **options)

        if not wild_scraper.schema.data:
            raise NoSchemaFoundInWildMode(org_url)

        return wild_scraper

    return scraper(url=org_url, html=html, **options)


__all__ = ["scrape_me", "scrape_html"]
name = "recipe_scrapers"
6 changes: 3 additions & 3 deletions recipe_scrapers/_abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ def __init__(
Union[float, Tuple, None]
] = None, # allows us to specify optional timeout for request
wild_mode: Optional[bool] = False,
html: Union[str, None] = None,
):
if settings.TEST_MODE: # when testing, we load a file
self.page_data = url.read()
url = "https://test.example.com/"
if html:
self.page_data = html
else:
self.page_data = requests.get(
url, headers=HEADERS, proxies=proxies, timeout=timeout
Expand Down
3 changes: 2 additions & 1 deletion recipe_scrapers/kptncook.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
class KptnCook(AbstractScraper):
def __init__(self, url, *args, **kwargs):
if settings.TEST_MODE: # pragma: no cover
self.recipe_json = json.loads(url.read())[0]
html = kwargs["html"]
self.recipe_json = json.loads(html)[0]
self.lang = KPTN_DEFAULT_LANGUAGE
self.final_url = "https://mobile.kptncook.com/recipe/pinterest/Low-Carb-Tarte-Flamb%C3%A9e-with-Serrano-Ham-%26-Cream-Cheese/315c3c32?lang=en"
else:
Expand Down
4 changes: 3 additions & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ def setUp(self):
f"tests/test_data/{test_file_name}.{self.test_file_extension}",
encoding="utf-8",
) as testfile:
self.harvester_class = self.scraper_class(testfile)
self.harvester_class = self.scraper_class(
url="https://test.example.com/", html=testfile.read()
)
canonical_url = self.harvester_class.canonical_url()
if self.online:
if not canonical_url:
Expand Down
4 changes: 3 additions & 1 deletion tests/test_wild_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ class TestWildMode(ScraperTest):

    def setUp(self):
        """Load the wild-mode HTML fixture and build the scraper under test.

        Reads the saved page as text and passes the string (not the file
        object) to the factory's ``generate``, together with a dummy URL,
        mirroring how ``scrape_html`` invokes wild mode.
        """
        with open("tests/test_data/wild_mode.testhtml", encoding="utf-8") as testfile:
            self.harvester_class = self.scraper_class.generate(
                url="https://test.example.com/", html=testfile.read()
            )

def test_host(self):
# let this one pass
Expand Down

0 comments on commit 0073d3c

Please sign in to comment.