diff --git a/src/extractor/extractors/media.py b/src/extractor/extractors/media.py index d98230e..a55b9ed 100644 --- a/src/extractor/extractors/media.py +++ b/src/extractor/extractors/media.py @@ -88,7 +88,7 @@ def load_media(path: Path, link_registry: LinkRegistry) -> Optional[pd.DataFrame media_df = media_df[media_df.columns.intersection(EXPORT_COLUMNS)] - media_df = media_df.rename(columns=RENAME_COLUMNS) + media_df = media_df.rename(columns=RENAME_COLUMNS, errors="ignore") link_registry.add_linkables( "media", media_df["source_url"].to_list(), media_df.index.to_list() diff --git a/src/extractor/extractors/pages.py b/src/extractor/extractors/pages.py index afbfc34..aaf8a61 100644 --- a/src/extractor/extractors/pages.py +++ b/src/extractor/extractors/pages.py @@ -78,7 +78,7 @@ def load_pages(path: Path, link_registry: LinkRegistry) -> Optional[pd.DataFrame ) pages_df = pages_df[pages_df.columns.intersection(EXPORT_COLUMNS)] - pages_df = pages_df.rename(columns=RENAME_COLUMNS) + pages_df = pages_df.rename(columns=RENAME_COLUMNS, errors="ignore") link_registry.add_linkables( "pages", pages_df["link"].to_list(), pages_df.index.to_list() diff --git a/src/extractor/extractors/posts.py b/src/extractor/extractors/posts.py index a00fc12..b1922bd 100644 --- a/src/extractor/extractors/posts.py +++ b/src/extractor/extractors/posts.py @@ -85,11 +85,14 @@ def load_posts( # yoast_head_json.og_image is a list containing 0 or 1 image dictionaries # Get the "url" property if there is an image - posts_df["og_image_url"] = posts_df["yoast_head_json.og_image"].apply( - lambda image: image[0]["url"] - if not isinstance(image, float) and len(image) > 0 - else None - ) + if "yoast_head_json.title" in posts_df.columns: + posts_df["og_image_url"] = posts_df["yoast_head_json.og_image"].apply( + lambda image: image[0]["url"] + if not isinstance(image, float) and len(image) > 0 + else None + ) + else: + posts_df["og_image_url"] = None posts_df["link_locale"] = 
def test_no_yoast_columns(datadir, scrape_urls_files):
    """Posts exported without any Yoast fields still load, with no og image URL."""
    yoast_keys = {"yoast_head", "yoast_head_json"}
    stripped_json = json_without_cols(datadir / "posts.json", yoast_keys)

    loaded = load_posts(stripped_json, LinkRegistry(), scrape_urls_files, None)

    # The first row must carry an explicit None rather than a missing column.
    first_row = loaded.iloc[0]
    assert first_row.og_image_url is None
def json_without_cols(in_file: Path, del_cols: Set[str]) -> Path:
    """Write a copy of a JSON record list with the given keys removed.

    Args:
        in_file: Path to a JSON file whose top-level value is a list of
            objects (dicts).
        del_cols: Top-level keys to drop from every object.

    Returns:
        Path to a newly created temporary file holding the filtered JSON.
        The file is not deleted automatically; ``in_file`` is left untouched.
    """
    records = json.loads(in_file.read_text(encoding="utf-8"))

    # Honour the caller-supplied key set instead of a hard-coded one.
    filtered = [
        {key: value for key, value in record.items() if key not in del_cols}
        for record in records
    ]

    # delete=False keeps the file on disk after the handle closes, so the
    # returned path remains readable by the code under test.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as out_file:
        json.dump(filtered, out_file)

    return Path(out_file.name)