Skip to content

Commit

Permalink
feat: allow scrape of html (#540)
Browse files Browse the repository at this point in the history
* implement html scraper option

* use html option for tests

* potential wild-mode fix

* pass string instead of TextIOWrapper

* document scrape_html
  • Loading branch information
hay-kot authored May 23, 2022
1 parent 9ab2d1d commit 0073d3c
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 7 deletions.
40 changes: 39 additions & 1 deletion recipe_scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import contextlib
from typing import Optional

from ._abstract import AbstractScraper
from ._exceptions import NoSchemaFoundInWildMode, WebsiteNotImplementedError
from ._factory import SchemaScraperFactory
from ._utils import get_host_name
Expand Down Expand Up @@ -417,5 +421,39 @@ def scrape_me(url_path, **options):
return scraper(url_path, **options)


__all__ = ["scrape_me"]
def scrape_html(html: str, org_url: Optional[str] = None, **options) -> AbstractScraper:
    """Build a scraper from a string of raw HTML.

    If *org_url* is given, its host name is used to look up a registered
    site-specific scraper. When no URL is supplied, or no scraper is
    registered for the host, the function falls back to wild mode, which
    relies on schema.org metadata embedded in the HTML.

    Args:
        html (str): raw HTML in text form.
        org_url (Optional[str], optional): original URL of the HTML.
            Defaults to None.
        **options: extra keyword arguments forwarded to the scraper.

    Raises:
        NoSchemaFoundInWildMode: if no site-specific scraper matches and
            no schema is found in wild mode.

    Returns:
        AbstractScraper: a scraper instance bound to the given HTML.
    """
    # Must be initialized before the conditional lookup: without this, the
    # `if not scraper` check below raises UnboundLocalError whenever
    # org_url is None or the host has no registered scraper (the KeyError
    # is suppressed, leaving `scraper` unassigned).
    scraper = None
    host_name = get_host_name(org_url) if org_url else None

    if host_name:
        with contextlib.suppress(KeyError):
            scraper = SCRAPERS[host_name]

    if not scraper:
        wild_scraper = SchemaScraperFactory.generate(url=org_url, html=html, **options)

        if not wild_scraper.schema.data:
            raise NoSchemaFoundInWildMode(org_url)

        return wild_scraper

    return scraper(url=org_url, html=html, **options)


__all__ = ["scrape_me", "scrape_html"]
name = "recipe_scrapers"
6 changes: 3 additions & 3 deletions recipe_scrapers/_abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ def __init__(
Union[float, Tuple, None]
] = None, # allows us to specify optional timeout for request
wild_mode: Optional[bool] = False,
html: Union[str, None] = None,
):
if settings.TEST_MODE: # when testing, we load a file
self.page_data = url.read()
url = "https://test.example.com/"
if html:
self.page_data = html
else:
self.page_data = requests.get(
url, headers=HEADERS, proxies=proxies, timeout=timeout
Expand Down
3 changes: 2 additions & 1 deletion recipe_scrapers/kptncook.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
class KptnCook(AbstractScraper):
def __init__(self, url, *args, **kwargs):
if settings.TEST_MODE: # pragma: no cover
self.recipe_json = json.loads(url.read())[0]
html = kwargs["html"]
self.recipe_json = json.loads(html)[0]
self.lang = KPTN_DEFAULT_LANGUAGE
self.final_url = "https://mobile.kptncook.com/recipe/pinterest/Low-Carb-Tarte-Flamb%C3%A9e-with-Serrano-Ham-%26-Cream-Cheese/315c3c32?lang=en"
else:
Expand Down
4 changes: 3 additions & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ def setUp(self):
f"tests/test_data/{test_file_name}.{self.test_file_extension}",
encoding="utf-8",
) as testfile:
self.harvester_class = self.scraper_class(testfile)
self.harvester_class = self.scraper_class(
url="https://test.example.com/", html=testfile.read()
)
canonical_url = self.harvester_class.canonical_url()
if self.online:
if not canonical_url:
Expand Down
4 changes: 3 additions & 1 deletion tests/test_wild_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ class TestWildMode(ScraperTest):

    def setUp(self):
        """Load the wild-mode HTML fixture and build the scraper under test.

        Reads the saved page as text and passes the string (not the file
        object) to the factory's ``generate``, together with a dummy URL,
        mirroring how ``scrape_html`` invokes wild mode.
        """
        with open("tests/test_data/wild_mode.testhtml", encoding="utf-8") as testfile:
            self.harvester_class = self.scraper_class.generate(
                url="https://test.example.com/", html=testfile.read()
            )

def test_host(self):
# let this one pass
Expand Down

0 comments on commit 0073d3c

Please sign in to comment.