Skip to content

Commit

Permalink
Remove yoast plugin requirement (#12)
Browse files Browse the repository at this point in the history
* better handling without yoast plugin

* run ruff
  • Loading branch information
freddyheppell authored Jul 2, 2024
1 parent 248e71a commit de859bd
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/extractor/extractors/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def load_media(path: Path, link_registry: LinkRegistry) -> Optional[pd.DataFrame

media_df = media_df[media_df.columns.intersection(EXPORT_COLUMNS)]

media_df = media_df.rename(columns=RENAME_COLUMNS)
media_df = media_df.rename(columns=RENAME_COLUMNS, errors="ignore")

link_registry.add_linkables(
"media", media_df["source_url"].to_list(), media_df.index.to_list()
Expand Down
2 changes: 1 addition & 1 deletion src/extractor/extractors/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def load_pages(path: Path, link_registry: LinkRegistry) -> Optional[pd.DataFrame
)

pages_df = pages_df[pages_df.columns.intersection(EXPORT_COLUMNS)]
pages_df = pages_df.rename(columns=RENAME_COLUMNS)
pages_df = pages_df.rename(columns=RENAME_COLUMNS, errors="ignore")

link_registry.add_linkables(
"pages", pages_df["link"].to_list(), pages_df.index.to_list()
Expand Down
17 changes: 10 additions & 7 deletions src/extractor/extractors/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,14 @@ def load_posts(

# yoast_head_json.og_image is a list containing 0 or 1 image dictionaries
# Get the "url" property if there is an image
posts_df["og_image_url"] = posts_df["yoast_head_json.og_image"].apply(
lambda image: image[0]["url"]
if not isinstance(image, float) and len(image) > 0
else None
)
# The Yoast plugin is optional: guard on the exact column that is read below
# (og_image), not on an unrelated Yoast column, so a site that exposes other
# Yoast fields but no og_image does not raise a KeyError.
if "yoast_head_json.og_image" in posts_df.columns:
    posts_df["og_image_url"] = posts_df["yoast_head_json.og_image"].apply(
        # A float here is presumably NaN for rows lacking the field; an empty
        # list means the post has no image. Both map to None.
        lambda image: image[0]["url"]
        if not isinstance(image, float) and len(image) > 0
        else None
    )
else:
    # No Yoast image data at all: still provide the column so that later
    # steps can rely on it existing.
    posts_df["og_image_url"] = None

posts_df["link_locale"] = posts_df["link"].apply(extract_locale)

Expand Down Expand Up @@ -119,8 +122,8 @@ def load_posts(
lambda r: extract_content_data(r["content.bs"], r["link"]), axis=1
)

posts_df = posts_df[EXPORT_COLUMNS]
posts_df = posts_df.rename(columns=RENAME_COLUMNS)
posts_df = posts_df[posts_df.columns.intersection(EXPORT_COLUMNS)]
posts_df = posts_df.rename(columns=RENAME_COLUMNS, errors="ignore")

return posts_df

Expand Down
9 changes: 6 additions & 3 deletions src/extractor/extractors/users.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,12 @@ def load_users(path: Path) -> Optional[pd.DataFrame]:
if users_df is None:
return None

users_df["avatar"] = users_df["yoast_head_json.og_image"].apply(
lambda image: image[0]["url"] if len(image) > 0 else None
)
# The Yoast plugin is optional, so only derive the avatar from og_image when
# that column is present; otherwise fall back to an all-None column.
if "yoast_head_json.og_image" in users_df.columns:
    users_df["avatar"] = users_df["yoast_head_json.og_image"].apply(
        # Mirror the posts extraction: skip float cells (presumably NaN for
        # users lacking the field) as well as empty lists, since len() on a
        # float would raise TypeError.
        lambda image: image[0]["url"]
        if not isinstance(image, float) and len(image) > 0
        else None
    )
else:
    users_df["avatar"] = None

users_df = users_df[users_df.columns.intersection(EXPORT_COLUMNS)]

Expand Down
8 changes: 8 additions & 0 deletions tests/extractors/test_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from extractor.parse.translations._resolver import TranslationLink
from helpers.df import ordered_col
from helpers.file import json_without_cols
from pytest_mock import MockerFixture


Expand Down Expand Up @@ -198,3 +199,10 @@ def test_resolves_media(posts_df_and_registry):
data_type="media",
idx=1,
)


def test_no_yoast_columns(datadir, scrape_urls_files):
    """Posts stripped of the Yoast fields still load, with a None og_image_url."""
    yoast_keys = {"yoast_head", "yoast_head_json"}
    stripped_path = json_without_cols(datadir / "posts.json", yoast_keys)

    loaded = load_posts(stripped_path, LinkRegistry(), scrape_urls_files, None)
    first_row = loaded.iloc[0]
    assert first_row.og_image_url is None
7 changes: 7 additions & 0 deletions tests/extractors/test_users.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
import pytest
from extractor.extractors.users import load_users
from helpers.file import json_without_cols


@pytest.fixture()
Expand All @@ -11,3 +12,9 @@ def users_df(datadir):
def test_user_load(datadir, users_df):
    """The loaded users frame equals the table-oriented JSON reference file."""
    reference_path = datadir / "users_df_out.json"
    reference_df = pd.read_json(reference_path, orient="table")
    assert users_df.equals(reference_df)


def test_no_yoast_columns(datadir):
    """Users stripped of the Yoast fields still load, with a None avatar."""
    yoast_keys = {"yoast_head", "yoast_head_json"}
    stripped_path = json_without_cols(datadir / "users.json", yoast_keys)

    loaded = load_users(stripped_path)
    first_row = loaded.iloc[0]
    assert first_row.avatar is None
19 changes: 19 additions & 0 deletions tests/helpers/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import json
import tempfile
from pathlib import Path
from typing import Set


def json_without_cols(in_file: Path, del_cols: Set[str]) -> Path:
    """Write a copy of a JSON file with the given keys removed from each item.

    Args:
        in_file: Path to a JSON file whose top level is a list of objects.
        del_cols: Keys to drop from every object in the list.

    Returns:
        Path to a newly created temporary file holding the filtered JSON.
        The file is created with ``delete=False``, so it is not removed
        automatically when closed.
    """
    in_data = json.loads(in_file.read_text())
    # Honour the caller-supplied key set rather than a fixed list, so the
    # helper can strip any columns a test needs removed.
    out_data = [
        {key: value for key, value in item.items() if key not in del_cols}
        for item in in_data
    ]

    with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f:
        json.dump(out_data, f)
        path = f.name

    return Path(path)

0 comments on commit de859bd

Please sign in to comment.