Skip to content

Commit

Permalink
Update tine.no scraper (#486)
Browse files Browse the repository at this point in the history
* Add missing cuisine methods to abstract scraper and scraper templates

* Add category to scraper templates

* Add parser for Schema.org description

* Rename test_tine -> test_tineno

To conform with scraper generation from templates

* Update Tine.no scraper to use more robust Schema.org implementation

Also regenerated the scraper from templates to add some missing methods
  • Loading branch information
mathiazom authored Jan 25, 2022
1 parent 7052bb5 commit 836a664
Show file tree
Hide file tree
Showing 9 changed files with 1,490 additions and 3,719 deletions.
6 changes: 6 additions & 0 deletions recipe_scrapers/_abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,12 @@ def ratings(self):
def author(self):
raise NotImplementedError("This should be implemented.")

def cuisine(self):
raise NotImplementedError("This should be implemented.")

def description(self):
raise NotImplementedError("This should be implemented.")

def reviews(self):
raise NotImplementedError("This should be implemented.")

Expand Down
3 changes: 3 additions & 0 deletions recipe_scrapers/_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ def author(self):
def cuisine(self):
return self.schema.cuisine()

def description(self):
return self.schema.description()

@classmethod
def generate(cls, url, **options):
return cls.SchemaScraper(url, **options)
6 changes: 6 additions & 0 deletions recipe_scrapers/_schemaorg.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,9 @@ def cuisine(self):
elif isinstance(cuisine, list):
return ",".join(cuisine)
return cuisine

def description(self):
    """Return the normalized ``description`` value of the schema data.

    Raises:
        SchemaOrgException: if the data has no ``description`` entry.
    """
    text = self.data.get("description")
    if text is not None:
        return normalize_string(text)
    raise SchemaOrgException("No description data in SchemaOrg.")
23 changes: 23 additions & 0 deletions recipe_scrapers/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,26 @@ def url_path_to_dict(path):

def get_host_name(url):
return url_path_to_dict(url.replace("://www.", "://"))["host"]


def change_keys(obj, convert):
    """
    Recursively goes through obj and replaces every dictionary key with convert(key).
    Useful for fixing incorrect property keys, e.g. in JSON-LD dictionaries.

    Dictionaries, lists, sets and tuples (including namedtuples) are rebuilt
    as new containers of the same class; any other value is returned as-is.
    If convert maps two keys of one dictionary to the same value, the entry
    visited last overwrites the earlier one.

    Credit: StackOverflow user 'baldr'
    (https://web.archive.org/web/20201022163147/https://stackoverflow.com/questions/11700705/python-recursively-replace
    -character-in-keys-of-nested-dictionary/33668421)

    :param obj: arbitrarily nested structure to process
    :param convert: callable taking a key and returning its replacement
    :return: a rebuilt copy of obj with converted keys
    """
    if isinstance(obj, dict):
        new = obj.__class__()
        for k, v in obj.items():
            new[convert(k)] = change_keys(v, convert)
        return new
    if isinstance(obj, tuple) and hasattr(obj, "_fields"):
        # namedtuple constructors take one positional argument per field,
        # not a single iterable, so the values have to be unpacked.
        return obj.__class__(*(change_keys(v, convert) for v in obj))
    if isinstance(obj, (list, set, tuple)):
        return obj.__class__(change_keys(v, convert) for v in obj)
    # Scalars (str, int, float, None, ...) and unknown types pass through.
    return obj
136 changes: 63 additions & 73 deletions recipe_scrapers/tineno.py
Original file line number Diff line number Diff line change
@@ -1,97 +1,87 @@
import re

from ._abstract import AbstractScraper
from ._utils import get_minutes, get_yields, normalize_string
from ._utils import change_keys


def fix_json_ld_property_keys(k):
    """Map the mis-capitalized key "ItemListElement" to "itemListElement".

    Every other key is returned unchanged.
    """
    return "itemListElement" if k == "ItemListElement" else k


class TineNo(AbstractScraper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Fix incorrect property naming
self.schema.data = change_keys(self.schema.data, fix_json_ld_property_keys)

@classmethod
def host(cls):
return "tine.no"

def author(self):
return self.schema.author()

def title(self):
return self.soup.find("h1").get_text()
return self.schema.title()

def category(self):
return self.schema.category()

def total_time(self):
total_time = 0
tt1 = self.soup.find("li", {"class": "m-recipe-overview__total-time"})
if tt1:
tt = tt1.find("span", {"class": "t-info"})
tt1 = normalize_string(tt.get_text())
tt2 = get_minutes(tt1)
if tt1 and (tt2 == 0):
total_time = tt1
else:
total_time = tt2
return total_time
return self.schema.total_time()

def yields(self):
recipe_yield = self.soup.find("input", {"id": "portions"})
if recipe_yield:
return recipe_yield["value"]
else:
return get_yields(
self.soup.find(
"div", {"class": "recipe-adjust-servings__original-serving"}
).get_text()
)
return self.schema.yields()

def image(self):
    """Return the URL of the header image, or None when the page has none.

    The widest candidate listed in the ``data-srcset`` attribute (entries of
    the form ``"<url> <width>w"``) is preferred. Candidates without a width
    descriptor are only used when no candidate declares a width; in that
    case the first one is returned. When ``data-srcset`` is missing or holds
    no candidate at all, the plain ``src`` attribute is returned instead.
    """
    image = self.soup.find("img", {"id": "HeaderMediaContent"})
    if not image:
        # No image available
        return None
    # Prefer data src set for higher image resolution
    srcset_str = image.get("data-srcset", None)
    if not srcset_str:
        # Fallback to regular img source
        return image.get("src", None)
    res_pattern = re.compile(r"(\d+)w")
    max_res = None
    max_res_src = None
    for src_str in srcset_str.split(","):
        parts = src_str.split()
        if not parts:
            # Empty entry, e.g. produced by a trailing comma
            continue
        # A candidate may lack a descriptor, or carry one that is not a width
        res_match = res_pattern.search(parts[1]) if len(parts) > 1 else None
        res = int(res_match.group(1)) if res_match else None
        is_wider = res is not None and (max_res is None or res > max_res)
        if max_res_src is None or is_wider:
            max_res = res
            max_res_src = parts[0]
    if max_res_src is None:
        # Source set held no usable candidate
        return image.get("src", None)
    return max_res_src

def ingredients(self):
# site uses <section><section>...</section></section>
ingredientsOuter = self.soup.findAll(
"div", {"id": "ingredient-groups-container"}
)
ingGroup = []
ingredients1 = ingredientsOuter[0].findAll("section")
for ings in ingredients1:
ingredients = ings.findAll("section")
for ingredient in ingredients:
if len(ingredient) > 1:
try:
header = ingredient.find(
"h3",
{
"class": "t-tertiary-heading o-recipe-ingredients__sub-title"
},
).get_text()
except Exception:
header = ""
tablelines = ingredient.findAll("td")
lst = []
cntr = 0
tmplst = []
for item in tablelines:
tmplst.append(normalize_string(item.get_text(strip=True)))
cntr += 1
if cntr % 2 == 0:
lst.append(" ".join(tmplst))
tmplst = []
if header != "":
ingGroup.append(header)
ingGroup += lst
return ingGroup
return self.schema.ingredients()

def instructions(self):
instructions = self.soup.find("ol", {"class": "o-recipe-steps__step-groups"})
"""
Standard Schema.org implementation, except removes HowToSection with generic 'Oppskrift' header.
These are "pseudo" sections that are not rendered on the actual website.
"""
return "\n".join(
[i for i in self.schema.instructions().split("\n") if i != "Oppskrift"]
)

ins = instructions.findAll("li", {"class": "o-recipe-steps__step-group"})
def ratings(self):
return self.schema.ratings()

return "\n".join([normalize_string(inst.text) for inst in ins])
def cuisine(self):
return self.schema.cuisine()

def description(self):
d = normalize_string(
self.soup.find(
"div",
{
"class": "t-ingress m-page-ingress m-page-ingress--recipe l-recipe__ingress"
},
).text
)

return d if d else None
return self.schema.description()
9 changes: 9 additions & 0 deletions templates/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ def author(self):
def title(self):
return self.schema.title()

def category(self):
return self.schema.category()

def total_time(self):
return self.schema.total_time()

Expand All @@ -29,3 +32,9 @@ def instructions(self):

def ratings(self):
return self.schema.ratings()

def cuisine(self):
return self.schema.cuisine()

def description(self):
return self.schema.description()
9 changes: 9 additions & 0 deletions templates/test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ def test_author(self):
def test_title(self):
self.assertEqual(None, self.harvester_class.title())

def test_category(self):
self.assertEqual(None, self.harvester_class.category())

def test_total_time(self):
self.assertEqual(None, self.harvester_class.total_time())

Expand All @@ -32,3 +35,9 @@ def test_instructions(self):

def test_ratings(self):
self.assertEqual(None, self.harvester_class.ratings())

def test_cuisine(self):
self.assertEqual(None, self.harvester_class.cuisine())

def test_description(self):
self.assertEqual(None, self.harvester_class.description())
4,972 changes: 1,346 additions & 3,626 deletions tests/test_data/tineno.testhtml

Large diffs are not rendered by default.

45 changes: 25 additions & 20 deletions tests/test_tine.py → tests/test_tineno.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@


class TestTineNoScraper(ScraperTest):

scraper_class = TineNo

def test_host(self):
Expand All @@ -15,14 +14,20 @@ def test_canonical_url(self):
self.harvester_class.canonical_url(),
)

def test_author(self):
self.assertEqual("TINE", self.harvester_class.author())

def test_title(self):
self.assertEqual(self.harvester_class.title(), "Rask kylling tikka masala")
self.assertEqual("Rask kylling tikka masala", self.harvester_class.title())

def test_category(self):
self.assertEqual("middag", self.harvester_class.category())

def test_total_time(self):
self.assertEqual(30, self.harvester_class.total_time())

def test_yields(self):
self.assertEqual("4", self.harvester_class.yields())
self.assertEqual("4 serving(s)", self.harvester_class.yields())

def test_image(self):
self.assertEqual(
Expand All @@ -31,44 +36,44 @@ def test_image(self):
)

def test_ingredients(self):
self.assertCountEqual(
self.assertEqual(
[
"Ris:",
"4 dl basmatiris",
"Tikka masala:",
"400 g kyllingfileter",
"1 ss TINE Meierismørtil steking",
"1 ss TINE® Meierismør til steking",
"1 stk paprika",
"½ dl chili",
"0.5 dl chili",
"3 stk vårløk",
"1 ts hvitløksfedd",
"1 ss hakket, friskingefær",
"½ dl hakket, friskkoriander",
"1 ss hakket, frisk ingefær",
"0.5 dl hakket, frisk koriander",
"2 ts garam masala",
"3 dl TINE Lett Crème Fraîche 18 %",
"3 dl TINE® Lett Crème Fraîche 18 %",
"3 ss tomatpuré",
"½ ts salt",
"¼ ts pepper",
"Raita:",
"½ dl slangeagurk",
"3 dl TINE Yoghurt Naturell",
"½ dl friskmynte",
"0.5 ts salt",
"0.25 ts pepper",
"0.5 dl slangeagurk",
"3 dl TINE® Yoghurt Naturell",
"0.5 dl frisk mynte",
"1 ts hvitløksfedd",
"½ ts salt",
"¼ ts pepper",
"0.5 ts salt",
"0.25 ts pepper",
],
self.harvester_class.ingredients(),
)

def test_instructions(self):
return self.assertEqual(
"Kok ris etter anvisningen på pakken.\nTikka masala: Del kylling i biter. Brun kyllingen i smør i en stekepanne på middels varme. Rens og hakk paprika, chili, vårløk og hvitløk og ha det i stekepannen sammen med kyllingen. Rens og finhakk ingefær og frisk koriander. Krydre med garam masala, koriander og ingefær. Hell i crème fraîche og tomatpuré, og la småkoke i 5 minutter. Smak til med salt og pepper.\nRaita: Riv agurk og bland den med yoghurt. Hakk mynte og hvitløk og bland det i. Smak til med salt og pepper.",
"Kok ris etter anvisningen på pakken.\nTikka masala:\nDel kylling i biter. Brun kyllingen i smør i en stekepanne på middels varme.\nRens og hakk paprika, chili, vårløk og hvitløk og ha det i stekepannen sammen med kyllingen. Rens og finhakk ingefær og frisk koriander. Krydre med garam masala, koriander og ingefær.\nHell i crème fraîche og tomatpuré, og la småkoke i 5 minutter. Smak til med salt og pepper.\nRaita:\nRiv agurk og bland den med yoghurt. Hakk mynte og hvitløk og bland det i. Smak til med salt og pepper.",
self.harvester_class.instructions(),
)

def test_ratings(self):
self.assertEqual(3.9, self.harvester_class.ratings())

def test_cuisine(self):
self.assertEqual("indisk", self.harvester_class.cuisine())

def test_description(self):
self.assertEqual(
"En god og rask oppskrift på en kylling tikka masala. Dette er en rett med små smakseksplosjoner som sender tankene til India.",
Expand Down

0 comments on commit 836a664

Please sign in to comment.