diff --git a/docs/changelog.md b/docs/changelog.md
index 5c03422..936ef78 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,5 +1,12 @@
# Changelog
+## 1.1.0 (upcoming)
+
+**Fixes**
+
+- Fixed the scrape crawl step crashing when a page had neither a canonical link nor an `og:url` meta tag
+- Fixed the scrape crawl failing to recognise duplicate URLs. Previously, duplicate pages were all included but only one was used; duplicates are now skipped and logged. As a result of this change, `SCRAPE_CRAWL_VERSION` has been incremented, so running extraction on an existing scrape will cause it to be re-crawled.
+
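+Re-crawling happens automatically the next time extraction is run. As a rough sketch (not the exact implementation), a cached crawl is discarded whenever its stored `version` field no longer matches the current `SCRAPE_CRAWL_VERSION`:
+
+```python
+if cache["version"] != SCRAPE_CRAWL_VERSION:
+    # Cache was written by an older crawler version; discard it and re-crawl
+    cache = None
+```
+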
## 1.0.3 (2024-08-06)
**Changes**
diff --git a/src/wpextract/scrape/crawler.py b/src/wpextract/scrape/crawler.py
index 3450c3a..cf3a547 100644
--- a/src/wpextract/scrape/crawler.py
+++ b/src/wpextract/scrape/crawler.py
@@ -8,7 +8,7 @@
from wpextract.scrape.processor import extract_self_url, self_url_strainer
# Increment to invalidate old caches
-SCRAPE_CRAWL_VERSION = 1
+SCRAPE_CRAWL_VERSION = 2
class ScrapeCrawl:
@@ -28,7 +28,7 @@ class ScrapeCrawl:
"""
found_pages: dict[str, str]
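+    # Relative paths of documents whose self-URL could not be found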
- failed_docs: list[Path]
+ failed_docs: list[str]
crawled = False
def __init__(self, root_path: Path) -> None:
@@ -95,14 +95,13 @@ def crawl(self) -> None:
doc_url = extract_self_url(doc)
if doc_url is None:
- self.failed_docs.append(path)
- logging.warning(f'Failed to find self-URL in doc "{path}"')
+ self.failed_docs.append(relative_path)
+ logging.warning(f'Failed to find self-URL in doc "{relative_path}"')
continue
- if doc_url in self.found_pages:
+ if doc_url in self.found_pages.values():
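+            # This URL was already recorded for another path; log and skip the duplicate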
logging.info(
- f"URL {doc_url} retrieved for {relative_path}, but has already been"
- f"found for {self.found_pages[relative_path]}"
+ f"URL {doc_url} retrieved for {relative_path}, but has already been found"
)
continue
diff --git a/tests/download/test_downloader.py b/tests/download/test_downloader.py
index 79f1a3d..b2b493c 100644
--- a/tests/download/test_downloader.py
+++ b/tests/download/test_downloader.py
@@ -25,12 +25,6 @@ def _fake_api_return():
return [{"id": idx, "title": "dummy return"} for idx in range(20)]
-def _export_method(datatype):
- if datatype == "comments":
- return "export_comments_interactive"
- return f"export_{datatype}"
-
-
def _mocked_exporter(mocker, datatype):
cls = "wpextract.download.exporter.Exporter."
if datatype == "comments":
diff --git a/tests/extractors/test_io.py b/tests/extractors/test_io.py
index 68d7e6a..d2c78e4 100644
--- a/tests/extractors/test_io.py
+++ b/tests/extractors/test_io.py
@@ -1,6 +1,10 @@
+import numpy as np
import pandas as pd
+import pytest
from helpers.df import ordered_col
from wpextract.extractors.io import (
+ _remove_nan,
+ _set_nested_keys,
df_denormalize_to_dict,
export_df,
load_df,
@@ -30,7 +34,19 @@ def test_load_df(datadir):
assert df["nested1.nested2"].equals(ordered_col(["value1", "value2"]))
-def test_df_denormalize(datadir):
+def test_load_df_from_path_doesnt_exist(datadir):
+ df = load_df(datadir / "notreal.json")
+
+ assert df is None
+
+
+def test_load_df_empty(datadir):
+ df = load_df(datadir / "empty.json")
+
+ assert df is None
+
+
+def test_df_denormalize():
df = pd.DataFrame(
[("a", "b"), ("c", "d")], columns=["one", "two.three"], index=[1, 2]
)
@@ -51,3 +67,33 @@ def test_export_df(datadir, tmp_path):
out_loaded = load_df(out_path)
assert df.equals(out_loaded)
+
+
+def test_export_df_none(tmp_path):
+ out_path = tmp_path / "out.json"
+ export_df(None, out_path)
+
+ assert out_path.exists()
+
+ assert out_path.read_text() == "[]"
+
+
+def test_set_nested_keys():
+ res = _set_nested_keys({}, ["one", "two", "three"], "value")
+ assert res == {"one": {"two": {"three": "value"}}}
+
+ with pytest.raises(ValueError, match="is already set"):
+ _set_nested_keys({"one": "two"}, ["one", "two", "three"], "value")
+
+
+@pytest.mark.parametrize(
+ ("inp", "exp_out"),
+ [
+ (pd.NA, None),
+ (np.nan, None),
+ ([1, pd.NA, 2, np.nan], [1, None, 2, None]),
+ ({"a": "foo", "b": pd.NA, "c": np.nan}, {"a": "foo", "b": None, "c": None}),
+ ],
+)
+def test_remove_nan(inp, exp_out):
+ assert _remove_nan(inp) == exp_out
diff --git a/tests/extractors/test_io/empty.json b/tests/extractors/test_io/empty.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/tests/extractors/test_io/empty.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/tests/parse/test_content.py b/tests/parse/test_content.py
index 2e4d370..5e1e19a 100644
--- a/tests/parse/test_content.py
+++ b/tests/parse/test_content.py
@@ -35,6 +35,17 @@ def test_extract_links(datadir: Path):
assert external == [Link(text="An external link", href="https://gate.ac.uk")]
+def test_extract_links_no_href():
+    doc = BeautifulSoup("<a>No href</a>", "lxml")
+
+ internal, external = extract_links(doc, "https://example.org/home")
+
+ assert internal == []
+ assert external == [
+ Link(text="No href", href=None),
+ ]
+
+
def test_extract_embeds(datadir: Path):
doc = BeautifulSoup((datadir / "embeds.html").read_text(), "lxml")
@@ -70,7 +81,7 @@ def test_extract_images(datadir: Path):
]
-def test_extract_image_without_src(datadir: Path):
+def test_extract_image_without_src():
    doc = BeautifulSoup("<img>", "lxml")
images = extract_images(doc, "https://example.org/home")
diff --git a/tests/parse/translations/test_pickers/polylang_custom_dropdown.html b/tests/parse/translations/data/polylang_custom_dropdown.html
similarity index 100%
rename from tests/parse/translations/test_pickers/polylang_custom_dropdown.html
rename to tests/parse/translations/data/polylang_custom_dropdown.html
diff --git a/tests/parse/translations/test_pickers/polylang_widget.html b/tests/parse/translations/data/polylang_widget.html
similarity index 100%
rename from tests/parse/translations/test_pickers/polylang_widget.html
rename to tests/parse/translations/data/polylang_widget.html
diff --git a/tests/parse/translations/test_extractor.py b/tests/parse/translations/test_extractor.py
new file mode 100644
index 0000000..f3d1480
--- /dev/null
+++ b/tests/parse/translations/test_extractor.py
@@ -0,0 +1,51 @@
+import logging
+
+import pytest
+from bs4 import BeautifulSoup
+from wpextract.parse.translations import extract_translations
+from wpextract.parse.translations._pickers import PolylangCustomDropdown, PolylangWidget
+
+
+class FaultyDummyPicker(PolylangWidget):
+ def extract(self) -> None:
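+        # Simulate a picker that matches the page but fails when selecting its element (".dummy")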
+ raise self._build_extraction_fail_err(".dummy")
+
+
+@pytest.fixture()
+def parsed_page(shared_datadir):
+ soup = BeautifulSoup((shared_datadir / "polylang_widget.html").read_text(), "lxml")
+ return soup
+
+
+def test_extract_translations(parsed_page):
+ res = extract_translations(
+ parsed_page, "https://example.org/current-lang-page/", translation_pickers=None
+ )
+
+ assert str(res.iloc[0]) == "en-US"
+ assert len(res.iloc[1]) == 1
+
+
+def test_none_matching(caplog, parsed_page):
+ with caplog.at_level(logging.DEBUG):
+ res = extract_translations(
+ parsed_page,
+ "https://example.org/current-lang-page/",
+ translation_pickers=[PolylangCustomDropdown],
+ )
+ assert res.iloc[0] is None
+ assert res.iloc[1] == []
+
+ assert "No translation pickers matched" in caplog.text
+
+
+def test_error_extracting(caplog, parsed_page):
+ res = extract_translations(
+ parsed_page,
+ "https://example.org/current-lang-page/",
+ translation_pickers=[FaultyDummyPicker],
+ )
+
+ assert res.iloc[0] is None
+ assert "but failed to select element with: .dummy" in caplog.text
diff --git a/tests/parse/translations/test_pickers.py b/tests/parse/translations/test_pickers.py
index 403b714..c611c6d 100644
--- a/tests/parse/translations/test_pickers.py
+++ b/tests/parse/translations/test_pickers.py
@@ -14,8 +14,10 @@
(pickers.PolylangCustomDropdown, "polylang_custom_dropdown.html"),
],
)
-def test_picker(datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str):
- doc = BeautifulSoup((datadir / picker_file).read_text(), "lxml")
+def test_picker(
+ shared_datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str
+):
+ doc = BeautifulSoup((shared_datadir / picker_file).read_text(), "lxml")
picker = picker_cls(doc)
@@ -48,8 +50,10 @@ def extract(self) -> None:
@pytest.mark.parametrize(
"picker_cls", [FaultyExtractPickerSelect, FaultyExtractPickerSelectOne]
)
-def test_picker_extract_error(datadir: Path, picker_cls: type[pickers.LangPicker]):
- doc = BeautifulSoup((datadir / "polylang_widget.html").read_text(), "lxml")
+def test_picker_extract_error(
+ shared_datadir: Path, picker_cls: type[pickers.LangPicker]
+):
+ doc = BeautifulSoup((shared_datadir / "polylang_widget.html").read_text(), "lxml")
picker = picker_cls(doc)
assert picker.matches()
diff --git a/tests/parse/translations/test_resolver.py b/tests/parse/translations/test_resolver.py
new file mode 100644
index 0000000..7a81c5c
--- /dev/null
+++ b/tests/parse/translations/test_resolver.py
@@ -0,0 +1,23 @@
+from typing import Union
+
+import pytest
+from langcodes import Language
+from wpextract.parse.translations import TranslationLink
+
+
+@pytest.mark.parametrize(
+ ("input_lang", "expected_language"),
+ [
+ ("en", "en"),
+ ("en-GB", "en-GB"),
+ ("fr-FR", "fr-FR"),
+ (Language.get("en-GB"), "en-GB"),
+ ("zho", "zh"),
+ ],
+)
+def test_translation_link_lang(
+ input_lang: Union[str, Language], expected_language: str
+):
+ link = TranslationLink(text=None, href=None, destination=None, lang=input_lang)
+
+ assert str(link.language) == expected_language
diff --git a/tests/scrape/data/diff_version_url_cache.json b/tests/scrape/data/diff_version_url_cache.json
new file mode 100644
index 0000000..405e1e6
--- /dev/null
+++ b/tests/scrape/data/diff_version_url_cache.json
@@ -0,0 +1 @@
+{"found": {}, "failed": [], "version": 1}
\ No newline at end of file
diff --git a/tests/scrape/data/expected_url_cache.json b/tests/scrape/data/expected_url_cache.json
new file mode 100644
index 0000000..4539f30
--- /dev/null
+++ b/tests/scrape/data/expected_url_cache.json
@@ -0,0 +1 @@
+{"found": {"fr/an-example-post-translation/index.html": "https://example.org/fr/an-example-post-translation/", "an-example-post/index.html": "https://example.org/an-example-post/", "an-unrelated-post/index.html": "https://example.org/an-unrelated-post/"}, "failed": ["no-self-url.html"], "version": 2}
\ No newline at end of file
diff --git a/tests/scrape/data/scrape/an-example-post/index.html b/tests/scrape/data/scrape/an-example-post/index.html
new file mode 100644
index 0000000..c641d63
--- /dev/null
+++ b/tests/scrape/data/scrape/an-example-post/index.html
@@ -0,0 +1,19 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <title>An example post</title>
+    <link rel="canonical" href="https://example.org/an-example-post/">
+    <meta property="og:url" content="https://example.org/an-example-post/">
+</head>
+
+<body>
+<main>
+    <article>
+        <h1>An example post</h1>
+        <p>This is an example post</p>
+        <p>It has two paragraphs.</p>
+    </article>
+</main>
+</body>
+</html>
diff --git a/tests/scrape/data/scrape/an-unrelated-post/index.html b/tests/scrape/data/scrape/an-unrelated-post/index.html
new file mode 100644
--- /dev/null
+++ b/tests/scrape/data/scrape/an-unrelated-post/index.html
@@ -0,0 +1,19 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <title>An unrelated post</title>
+    <link rel="canonical" href="https://example.org/an-unrelated-post/">
+    <meta property="og:url" content="https://example.org/an-unrelated-post/">
+</head>
+
+<body>
+<main>
+    <article>
+        <h1>An unrelated post</h1>
+        <p>This is an unrelated post.</p>
+        <p>It has two paragraphs.</p>
+    </article>
+</main>
+</body>
+</html>
diff --git a/tests/scrape/data/scrape/fr/an-example-post-translation/index.html b/tests/scrape/data/scrape/fr/an-example-post-translation/index.html
new file mode 100644
index 0000000..2a1dc88
--- /dev/null
+++ b/tests/scrape/data/scrape/fr/an-example-post-translation/index.html
@@ -0,0 +1,21 @@
+<!DOCTYPE html>
+<html lang="fr">
+<head>
+    <meta charset="utf-8">
+    <title>An example post translation</title>
+    <link rel="canonical" href="https://example.org/fr/an-example-post-translation/">
+    <meta property="og:url" content="https://example.org/fr/an-example-post-translation/">
+    <link rel="alternate" hreflang="en-US" href="https://example.org/an-example-post/">
+    <link rel="alternate" hreflang="fr-FR" href="https://example.org/fr/an-example-post-translation/">
+</head>
+
+<body>
+<main>
+    <article>
+        <h1>An example post translation</h1>
+        <p>This is a translation of the post.</p>
+        <p>It has two paragraphs.</p>
+    </article>
+</main>
+</body>
+</html>
diff --git a/tests/scrape/data/scrape/no-self-url.html b/tests/scrape/data/scrape/no-self-url.html
new file mode 100644
index 0000000..152ee7f
--- /dev/null
+++ b/tests/scrape/data/scrape/no-self-url.html
@@ -0,0 +1,9 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <title>A page with no self-URL</title>
+</head>
+<body>
+    <p>This page has neither a canonical link nor an og:url meta tag.</p>
+</body>
+</html>