-
Notifications
You must be signed in to change notification settings - Fork 524
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add missing cuisine methods to abstract scraper and scraper templates
* Add category to scraper templates
* Add parser for Schema.org description
* Rename test_tine -> test_tineno to conform with scraper generation from templates
* Update Tine.no scraper to use a more robust Schema.org implementation; also regenerated the scraper from templates to add some missing methods
- Loading branch information
Showing
9 changed files
with
1,490 additions
and
3,719 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,97 +1,87 @@ | ||
import re | ||
|
||
from ._abstract import AbstractScraper | ||
from ._utils import get_minutes, get_yields, normalize_string | ||
from ._utils import change_keys | ||
|
||
|
||
def fix_json_ld_property_keys(k): | ||
if k == "ItemListElement": | ||
return "itemListElement" | ||
return k | ||
|
||
|
||
class TineNo(AbstractScraper): | ||
def __init__(self, *args, **kwargs): | ||
super().__init__(*args, **kwargs) | ||
# Fix incorrect property naming | ||
self.schema.data = change_keys(self.schema.data, fix_json_ld_property_keys) | ||
|
||
@classmethod | ||
def host(cls): | ||
return "tine.no" | ||
|
||
def author(self): | ||
return self.schema.author() | ||
|
||
def title(self): | ||
return self.soup.find("h1").get_text() | ||
return self.schema.title() | ||
|
||
def category(self): | ||
return self.schema.category() | ||
|
||
def total_time(self): | ||
total_time = 0 | ||
tt1 = self.soup.find("li", {"class": "m-recipe-overview__total-time"}) | ||
if tt1: | ||
tt = tt1.find("span", {"class": "t-info"}) | ||
tt1 = normalize_string(tt.get_text()) | ||
tt2 = get_minutes(tt1) | ||
if tt1 and (tt2 == 0): | ||
total_time = tt1 | ||
else: | ||
total_time = tt2 | ||
return total_time | ||
return self.schema.total_time() | ||
|
||
def yields(self): | ||
recipe_yield = self.soup.find("input", {"id": "portions"}) | ||
if recipe_yield: | ||
return recipe_yield["value"] | ||
else: | ||
return get_yields( | ||
self.soup.find( | ||
"div", {"class": "recipe-adjust-servings__original-serving"} | ||
).get_text() | ||
) | ||
return self.schema.yields() | ||
|
||
def image(self): | ||
image = self.soup.find("img", {"id": "HeaderMediaContent"}) | ||
if image: | ||
# tag = image.find('src') | ||
src = image.get("src", None) | ||
|
||
return src if image else None | ||
if not image: | ||
# No image available | ||
return None | ||
# Prefer data src set for higher image resolution | ||
srcset_str = image.get("data-srcset", None) | ||
if not srcset_str: | ||
# Fallback to regular img source | ||
return image.get("src", None) | ||
# Convert source set string to cleaner list | ||
srcset = srcset_str.split(",") | ||
# Find image with the best resolution | ||
max_res = None | ||
max_res_src = None | ||
res_pattern = re.compile(r"(\d+)w") | ||
for src_str in srcset: | ||
src = src_str.strip().split(" ") | ||
res_match = res_pattern.findall(src[1]) | ||
res = None | ||
if res_match: | ||
try: | ||
res = int(res_match[0]) | ||
except ValueError: | ||
pass | ||
if max_res is None or res > max_res: | ||
max_res = res | ||
max_res_src = src[0] | ||
return max_res_src | ||
|
||
def ingredients(self): | ||
# site uses <section><section>...</section></section> | ||
ingredientsOuter = self.soup.findAll( | ||
"div", {"id": "ingredient-groups-container"} | ||
) | ||
ingGroup = [] | ||
ingredients1 = ingredientsOuter[0].findAll("section") | ||
for ings in ingredients1: | ||
ingredients = ings.findAll("section") | ||
for ingredient in ingredients: | ||
if len(ingredient) > 1: | ||
try: | ||
header = ingredient.find( | ||
"h3", | ||
{ | ||
"class": "t-tertiary-heading o-recipe-ingredients__sub-title" | ||
}, | ||
).get_text() | ||
except Exception: | ||
header = "" | ||
tablelines = ingredient.findAll("td") | ||
lst = [] | ||
cntr = 0 | ||
tmplst = [] | ||
for item in tablelines: | ||
tmplst.append(normalize_string(item.get_text(strip=True))) | ||
cntr += 1 | ||
if cntr % 2 == 0: | ||
lst.append(" ".join(tmplst)) | ||
tmplst = [] | ||
if header != "": | ||
ingGroup.append(header) | ||
ingGroup += lst | ||
return ingGroup | ||
return self.schema.ingredients() | ||
|
||
def instructions(self): | ||
instructions = self.soup.find("ol", {"class": "o-recipe-steps__step-groups"}) | ||
""" | ||
Standard Schema.org implementation, except removes HowToSection with generic 'Oppskrift' header. | ||
These are "pseudo" sections that are not rendered on the actual website. | ||
""" | ||
return "\n".join( | ||
[i for i in self.schema.instructions().split("\n") if i != "Oppskrift"] | ||
) | ||
|
||
ins = instructions.findAll("li", {"class": "o-recipe-steps__step-group"}) | ||
def ratings(self): | ||
return self.schema.ratings() | ||
|
||
return "\n".join([normalize_string(inst.text) for inst in ins]) | ||
def cuisine(self): | ||
return self.schema.cuisine() | ||
|
||
def description(self): | ||
d = normalize_string( | ||
self.soup.find( | ||
"div", | ||
{ | ||
"class": "t-ingress m-page-ingress m-page-ingress--recipe l-recipe__ingress" | ||
}, | ||
).text | ||
) | ||
|
||
return d if d else None | ||
return self.schema.description() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters