From 0610aa78f935c717385f75aeaa8318a7ed36a29b Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Thu, 8 Aug 2024 17:42:03 +0100 Subject: [PATCH] more tests for download/parse/extract --- docs/changelog.md | 7 ++ src/wpextract/scrape/crawler.py | 13 ++- tests/download/test_downloader.py | 6 -- tests/extractors/test_io.py | 48 ++++++++++- tests/extractors/test_io/empty.json | 1 + tests/parse/test_content.py | 13 ++- .../polylang_custom_dropdown.html | 0 .../polylang_widget.html | 0 tests/parse/translations/test_extractor.py | 50 ++++++++++++ tests/parse/translations/test_pickers.py | 12 ++- tests/parse/translations/test_resolver.py | 23 ++++++ tests/scrape/data/diff_version_url_cache.json | 1 + tests/scrape/data/expected_url_cache.json | 1 + .../data/scrape/an-example-post/index.html | 19 +++++ .../data/scrape/an-unrelated-post/index.html | 18 +++++ .../fr/an-example-post-translation/index.html | 21 +++++ tests/scrape/data/scrape/no-self-url.html | 9 +++ tests/scrape/test_crawler.py | 80 +++++++++++++++++++ tests/scrape/test_processor.py | 65 +++++++++++++++ .../scrape/test_processor/link_canonical.html | 10 +++ .../link_canonical_empty_href.html | 10 +++ .../link_canonical_no_href.html | 10 +++ tests/scrape/test_processor/no_head.html | 6 ++ tests/scrape/test_processor/no_self_url.html | 9 +++ tests/scrape/test_processor/og_url.html | 10 +++ .../test_processor/og_url_empty_content.html | 10 +++ .../test_processor/og_url_no_content.html | 10 +++ .../scrape/test_processor/self_url_both.html | 11 +++ tests/scrape/test_scrape.py | 16 ++++ 29 files changed, 470 insertions(+), 19 deletions(-) create mode 100644 tests/extractors/test_io/empty.json rename tests/parse/translations/{test_pickers => data}/polylang_custom_dropdown.html (100%) rename tests/parse/translations/{test_pickers => data}/polylang_widget.html (100%) create mode 100644 tests/parse/translations/test_extractor.py create mode 100644 tests/parse/translations/test_resolver.py create mode 100644 tests/scrape/data/diff_version_url_cache.json create mode 100644 tests/scrape/data/expected_url_cache.json create mode 100644 tests/scrape/data/scrape/an-example-post/index.html create mode 100644 tests/scrape/data/scrape/an-unrelated-post/index.html create mode 100644 tests/scrape/data/scrape/fr/an-example-post-translation/index.html create mode 100644 tests/scrape/data/scrape/no-self-url.html create mode 100644 tests/scrape/test_crawler.py create mode 100644 tests/scrape/test_processor.py create mode 100644 tests/scrape/test_processor/link_canonical.html create mode 100644 tests/scrape/test_processor/link_canonical_empty_href.html create mode 100644 tests/scrape/test_processor/link_canonical_no_href.html create mode 100644 tests/scrape/test_processor/no_head.html create mode 100644 tests/scrape/test_processor/no_self_url.html create mode 100644 tests/scrape/test_processor/og_url.html create mode 100644 tests/scrape/test_processor/og_url_empty_content.html create mode 100644 tests/scrape/test_processor/og_url_no_content.html create mode 100644 tests/scrape/test_processor/self_url_both.html create mode 100644 tests/scrape/test_scrape.py diff --git a/docs/changelog.md b/docs/changelog.md index 5c03422..936ef78 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,12 @@ # Changelog +## 1.1.0 (upcoming) + +**Fixes** + +- Fixed the scrape crawling step crashing if a page didn't have a canonical link or `og:url` meta tag +- Fixed the scrape crawling not correctly recognising when duplicate URLs were encountered. 
Previously, duplicate URLs were not detected: every copy was kept in the crawl results, but only one was ultimately used. Duplicates are now logged and skipped. As a result of this change, `SCRAPE_CRAWL_VERSION` has been incremented, so running an extraction on an existing scrape will cause it to be re-crawled.
+
 ## 1.0.3 (2024-08-06)
 
 **Changes**
diff --git a/src/wpextract/scrape/crawler.py b/src/wpextract/scrape/crawler.py
index 3450c3a..cf3a547 100644
--- a/src/wpextract/scrape/crawler.py
+++ b/src/wpextract/scrape/crawler.py
@@ -8,7 +8,7 @@ from wpextract.scrape.processor import extract_self_url, self_url_strainer
 
 # Increment to invalidate old caches
-SCRAPE_CRAWL_VERSION = 1
+SCRAPE_CRAWL_VERSION = 2
 
 
 class ScrapeCrawl:
@@ -28,7 +28,7 @@ class ScrapeCrawl:
     """
 
     found_pages: dict[str, str]
-    failed_docs: list[Path]
+    failed_docs: list[str]
     crawled = False
 
     def __init__(self, root_path: Path) -> None:
@@ -95,14 +95,13 @@ def crawl(self) -> None:
             doc_url = extract_self_url(doc)
 
             if doc_url is None:
-                self.failed_docs.append(path)
-                logging.warning(f'Failed to find self-URL in doc "{path}"')
+                self.failed_docs.append(relative_path)
+                logging.warning(f'Failed to find self-URL in doc "{relative_path}"')
                 continue
 
-            if doc_url in self.found_pages:
+            if doc_url in self.found_pages.values():
                 logging.info(
-                    f"URL {doc_url} retrieved for {relative_path}, but has already been"
-                    f"found for {self.found_pages[relative_path]}"
+                    f"URL {doc_url} retrieved for {relative_path}, but has already been found"
                 )
                 continue
 
diff --git a/tests/download/test_downloader.py b/tests/download/test_downloader.py
index 79f1a3d..b2b493c 100644
--- a/tests/download/test_downloader.py
+++ b/tests/download/test_downloader.py
@@ -25,12 +25,6 @@ def _fake_api_return():
     return [{"id": idx, "title": "dummy return"} for idx in range(20)]
 
 
-def _export_method(datatype):
-    if datatype == "comments":
-        return "export_comments_interactive"
-    return f"export_{datatype}"
-
-
 def _mocked_exporter(mocker, datatype):
     cls = "wpextract.download.exporter.Exporter."
     if datatype == "comments":
diff --git a/tests/extractors/test_io.py b/tests/extractors/test_io.py
index 68d7e6a..d2c78e4 100644
--- a/tests/extractors/test_io.py
+++ b/tests/extractors/test_io.py
@@ -1,6 +1,10 @@
+import numpy as np
 import pandas as pd
+import pytest
 from helpers.df import ordered_col
 from wpextract.extractors.io import (
+    _remove_nan,
+    _set_nested_keys,
     df_denormalize_to_dict,
     export_df,
     load_df,
@@ -30,7 +34,19 @@ def test_load_df(datadir):
     assert df["nested1.nested2"].equals(ordered_col(["value1", "value2"]))
 
 
-def test_df_denormalize(datadir):
+def test_load_df_from_path_doesnt_exist(datadir):
+    df = load_df(datadir / "notreal.json")
+
+    assert df is None
+
+
+def test_load_df_empty(datadir):
+    df = load_df(datadir / "empty.json")
+
+    assert df is None
+
+
+def test_df_denormalize():
     df = pd.DataFrame(
         [("a", "b"), ("c", "d")], columns=["one", "two.three"], index=[1, 2]
     )
@@ -51,3 +67,33 @@ def test_export_df(datadir, tmp_path):
     out_loaded = load_df(out_path)
 
     assert df.equals(out_loaded)
+
+
+def test_export_df_none(tmp_path):
+    out_path = tmp_path / "out.json"
+    export_df(None, out_path)
+
+    assert out_path.exists()
+
+    assert out_path.read_text() == "[]"
+
+
+def test_set_nested_keys():
+    res = _set_nested_keys({}, ["one", "two", "three"], "value")
+    assert res == {"one": {"two": {"three": "value"}}}
+
+    with pytest.raises(ValueError, match="is already set"):
+        _set_nested_keys({"one": "two"}, ["one", "two", "three"], "value")
+
+
+@pytest.mark.parametrize(
+    ("inp", "exp_out"),
+    [
+        (pd.NA, None),
+        (np.nan, None),
+        ([1, pd.NA, 2, np.nan], [1, None, 2, None]),
+        ({"a": "foo", "b": pd.NA, "c": np.nan}, {"a": "foo", "b": None, "c": None}),
+    ],
+)
+def test_remove_nan(inp, exp_out):
+    assert _remove_nan(inp) == exp_out
diff --git a/tests/extractors/test_io/empty.json b/tests/extractors/test_io/empty.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/tests/extractors/test_io/empty.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/tests/parse/test_content.py b/tests/parse/test_content.py
index 2e4d370..5e1e19a 100644
--- a/tests/parse/test_content.py
+++ b/tests/parse/test_content.py
@@ -35,6 +35,17 @@ def test_extract_links(datadir: Path):
     assert external == [Link(text="An external link", href="https://gate.ac.uk")]
 
 
+def test_extract_links_no_href():
+    doc = BeautifulSoup("<a>No href</a>", "lxml")
+
+    internal, external = extract_links(doc, "https://example.org/home")
+
+    assert internal == []
+    assert external == [
+        Link(text="No href", href=None),
+    ]
+
+
 def test_extract_embeds(datadir: Path):
     doc = BeautifulSoup((datadir / "embeds.html").read_text(), "lxml")
 
@@ -70,7 +81,7 @@ def test_extract_images(datadir: Path):
     ]
 
 
-def test_extract_image_without_src(datadir: Path):
+def test_extract_image_without_src():
     doc = BeautifulSoup("<img>No src", "lxml")
 
     images = extract_images(doc, "https://example.org/home")
diff --git a/tests/parse/translations/test_pickers/polylang_custom_dropdown.html b/tests/parse/translations/data/polylang_custom_dropdown.html
similarity index 100%
rename from tests/parse/translations/test_pickers/polylang_custom_dropdown.html
rename to tests/parse/translations/data/polylang_custom_dropdown.html
diff --git a/tests/parse/translations/test_pickers/polylang_widget.html b/tests/parse/translations/data/polylang_widget.html
similarity index 100%
rename from tests/parse/translations/test_pickers/polylang_widget.html
rename to tests/parse/translations/data/polylang_widget.html
diff --git a/tests/parse/translations/test_extractor.py
b/tests/parse/translations/test_extractor.py
new file mode 100644
index 0000000..f3d1480
--- /dev/null
+++ b/tests/parse/translations/test_extractor.py
@@ -0,0 +1,50 @@
+import logging
+
+import pytest
+from bs4 import BeautifulSoup
+from wpextract.parse.translations import extract_translations
+from wpextract.parse.translations._pickers import PolylangCustomDropdown, PolylangWidget
+
+
+class FaultyDummyPicker(PolylangWidget):
+    def extract(self) -> None:
+        raise self._build_extraction_fail_err(".dummy")
+
+
+@pytest.fixture()
+def parsed_page(shared_datadir):
+    soup = BeautifulSoup((shared_datadir / "polylang_widget.html").read_text(), "lxml")
+    return soup
+
+
+def test_extract_translations(parsed_page):
+    res = extract_translations(
+        parsed_page, "https://example.org/current-lang-page/", translation_pickers=None
+    )
+
+    assert str(res.iloc[0]) == "en-US"
+    assert len(res.iloc[1]) == 1
+
+
+def test_none_matching(caplog, parsed_page):
+    with caplog.at_level(logging.DEBUG):
+        res = extract_translations(
+            parsed_page,
+            "https://example.org/current-lang-page/",
+            translation_pickers=[PolylangCustomDropdown],
+        )
+    assert res.iloc[0] is None
+    assert res.iloc[1] == []
+
+    assert "No translation pickers matched" in caplog.text
+
+
+def test_error_extracting(caplog, parsed_page):
+    res = extract_translations(
+        parsed_page,
+        "https://example.org/current-lang-page/",
+        translation_pickers=[FaultyDummyPicker],
+    )
+
+    assert res.iloc[0] is None
+    assert "but failed to select element with: .dummy" in caplog.text
diff --git a/tests/parse/translations/test_pickers.py b/tests/parse/translations/test_pickers.py
index 403b714..c611c6d 100644
--- a/tests/parse/translations/test_pickers.py
+++ b/tests/parse/translations/test_pickers.py
@@ -14,8 +14,10 @@
         (pickers.PolylangCustomDropdown, "polylang_custom_dropdown.html"),
     ],
 )
-def test_picker(datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str):
-    doc = BeautifulSoup((datadir / picker_file).read_text(), "lxml")
+def test_picker(
+    shared_datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str
+):
+    doc = BeautifulSoup((shared_datadir / picker_file).read_text(), "lxml")
 
     picker = picker_cls(doc)
 
@@ -48,8 +50,10 @@ def extract(self) -> None:
 @pytest.mark.parametrize(
     "picker_cls", [FaultyExtractPickerSelect, FaultyExtractPickerSelectOne]
 )
-def test_picker_extract_error(datadir: Path, picker_cls: type[pickers.LangPicker]):
-    doc = BeautifulSoup((datadir / "polylang_widget.html").read_text(), "lxml")
+def test_picker_extract_error(
+    shared_datadir: Path, picker_cls: type[pickers.LangPicker]
+):
+    doc = BeautifulSoup((shared_datadir / "polylang_widget.html").read_text(), "lxml")
 
     picker = picker_cls(doc)
     assert picker.matches()
diff --git a/tests/parse/translations/test_resolver.py b/tests/parse/translations/test_resolver.py
new file mode 100644
index 0000000..7a81c5c
--- /dev/null
+++ b/tests/parse/translations/test_resolver.py
@@ -0,0 +1,23 @@
+from typing import Union
+
+import pytest
+from langcodes import Language
+from wpextract.parse.translations import TranslationLink
+
+
+@pytest.mark.parametrize(
+    ("input_lang", "expected_language"),
+    [
+        ("en", "en"),
+        ("en-GB", "en-GB"),
+        ("fr-FR", "fr-FR"),
+        (Language.get("en-GB"), "en-GB"),
+        ("zho", "zh"),
+    ],
+)
+def test_translation_link_lang(
+    input_lang: Union[str, Language], expected_language: str
+):
+    link = TranslationLink(text=None, href=None, destination=None, lang=input_lang)
+
+    assert str(link.language) == expected_language
diff --git
a/tests/scrape/data/diff_version_url_cache.json b/tests/scrape/data/diff_version_url_cache.json new file mode 100644 index 0000000..405e1e6 --- /dev/null +++ b/tests/scrape/data/diff_version_url_cache.json @@ -0,0 +1 @@ +{"found": {}, "failed": [], "version": 1} \ No newline at end of file diff --git a/tests/scrape/data/expected_url_cache.json b/tests/scrape/data/expected_url_cache.json new file mode 100644 index 0000000..4539f30 --- /dev/null +++ b/tests/scrape/data/expected_url_cache.json @@ -0,0 +1 @@ +{"found": {"fr/an-example-post-translation/index.html": "https://example.org/fr/an-example-post-translation/", "an-example-post/index.html": "https://example.org/an-example-post/", "an-unrelated-post/index.html": "https://example.org/an-unrelated-post/"}, "failed": ["no-self-url.html"], "version": 2} \ No newline at end of file diff --git a/tests/scrape/data/scrape/an-example-post/index.html b/tests/scrape/data/scrape/an-example-post/index.html new file mode 100644 index 0000000..c641d63 --- /dev/null +++ b/tests/scrape/data/scrape/an-example-post/index.html @@ -0,0 +1,19 @@ + + + + + + + An Example Post - example.org + + + + +

This is an example post

+

It has two paragraphs.

+ an image + + diff --git a/tests/scrape/data/scrape/an-unrelated-post/index.html b/tests/scrape/data/scrape/an-unrelated-post/index.html new file mode 100644 index 0000000..f55ffa8 --- /dev/null +++ b/tests/scrape/data/scrape/an-unrelated-post/index.html @@ -0,0 +1,18 @@ + + + + + + + An Unrelated Post - example.org + + + + +

This is an unrelated post.

+

It has two paragraphs.

+ + diff --git a/tests/scrape/data/scrape/fr/an-example-post-translation/index.html b/tests/scrape/data/scrape/fr/an-example-post-translation/index.html new file mode 100644 index 0000000..2a1dc88 --- /dev/null +++ b/tests/scrape/data/scrape/fr/an-example-post-translation/index.html @@ -0,0 +1,21 @@ + + + + + + + An Example Post Translation - example.org + + + + +

This is a translation of the post.

+

It has two paragraphs.

+ + diff --git a/tests/scrape/data/scrape/no-self-url.html b/tests/scrape/data/scrape/no-self-url.html new file mode 100644 index 0000000..152ee7f --- /dev/null +++ b/tests/scrape/data/scrape/no-self-url.html @@ -0,0 +1,9 @@ + + + + Test Document + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_crawler.py b/tests/scrape/test_crawler.py new file mode 100644 index 0000000..8dc06af --- /dev/null +++ b/tests/scrape/test_crawler.py @@ -0,0 +1,80 @@ +import logging +from pathlib import Path + +import pytest +from wpextract.scrape.crawler import ScrapeCrawl + +EXPECTED_PAGES = { + "an-example-post/index.html": "https://example.org/an-example-post/", + "an-unrelated-post/index.html": "https://example.org/an-unrelated-post/", + "fr/an-example-post-translation/index.html": "https://example.org/fr/an-example-post-translation/", +} + + +def expected_link_abs_path(root_dir: Path): + root_path = str(root_dir.resolve()) + return {link: Path(root_path + "/" + path) for path, link in EXPECTED_PAGES.items()} + + +def test_scrape_crawl(shared_datadir): + root_path = shared_datadir / "scrape" + + crawl = ScrapeCrawl(root_path) + + crawl.crawl() + + assert crawl.found_pages == EXPECTED_PAGES + + assert crawl.failed_docs == ["no-self-url.html"] + + assert crawl.get_link_abs_path() == expected_link_abs_path(root_path) + + assert (root_path / "url_cache.json").is_file() + assert (root_path / "url_cache.json").read_text() == ( + shared_datadir / "expected_url_cache.json" + ).read_text() + + +def test_recrawl_different_version(caplog, shared_datadir: Path): + root_path = shared_datadir / "scrape" + (shared_datadir / "diff_version_url_cache.json").rename( + root_path / "url_cache.json" + ) + + with caplog.at_level(logging.INFO): + crawl = ScrapeCrawl(root_path) + crawl.crawl() + + assert crawl.found_pages != {} + assert "out of date" in caplog.text + + +def test_scrape_crawl_cached(shared_datadir: Path): + root_path = shared_datadir / "scrape" + crawl = ScrapeCrawl(root_path) + crawl.crawl() + + (root_path / "an-example-post/index.html").unlink() + + crawl2 = ScrapeCrawl(root_path) + crawl2.crawl() + assert crawl2.found_pages == EXPECTED_PAGES + + +def test_duplicate_url(caplog, shared_datadir: Path): + root_path = shared_datadir / "scrape" + (root_path / "dupe.html").write_text( + (root_path / "an-example-post/index.html").read_text() + ) + + with caplog.at_level(logging.INFO): + crawl = ScrapeCrawl(root_path) + crawl.crawl() + + assert "but has already been found" in caplog.text + assert "https://example.org/an-example-post/" in crawl.found_pages.values() + + +def test_crawl_non_dir(shared_datadir: Path): + with pytest.raises(ValueError, match="is not a directory"): + ScrapeCrawl(shared_datadir / "notreal") diff --git a/tests/scrape/test_processor.py b/tests/scrape/test_processor.py new file mode 100644 index 0000000..1733f0e --- /dev/null +++ b/tests/scrape/test_processor.py @@ -0,0 +1,65 @@ +import pytest +from bs4 import BeautifulSoup +from wpextract.scrape.processor import ( + _is_url_valid, + extract_self_url, + get_link_canonical, + get_og_url, +) + + +@pytest.mark.parametrize( + ("url", "expected"), + [ + ("https://example.org/", True), + ("http://example.org/", True), + ("/", False), + ("notaurl", False), + ], +) +def test_url_valid(url, expected): + assert _is_url_valid(url) == expected + + +@pytest.mark.parametrize( + ("file", "exp_out"), + [ + ("link_canonical.html", "https://example.org/page_canon/"), + ("link_canonical_no_href.html", None), + ("link_canonical_empty_href.html", None), + ("no_head.html", None), + ("og_url.html", None), + ], +) +def test_get_link_canonical(datadir, file, exp_out): + soup = BeautifulSoup((datadir / file).read_text(), "lxml") + assert get_link_canonical(soup) == exp_out + + +@pytest.mark.parametrize( 
+ ("file", "exp_out"), + [ + ("og_url.html", "https://example.org/page_og/"), + ("og_url_no_content.html", None), + ("og_url_empty_content.html", None), + ("no_head.html", None), + ("link_canonical.html", None), + ], +) +def test_get_og_url(datadir, file, exp_out): + soup = BeautifulSoup((datadir / file).read_text(), "lxml") + assert get_og_url(soup) == exp_out + + +@pytest.mark.parametrize( + ("file", "exp_out"), + [ + ("link_canonical.html", "https://example.org/page_canon/"), + ("og_url.html", "https://example.org/page_og/"), + ("self_url_both.html", "https://example.org/page_canon/"), + ("no_self_url.html", None), + ], +) +def test_extract_self_url(datadir, file, exp_out): + soup = BeautifulSoup((datadir / file).read_text(), "lxml") + assert extract_self_url(soup) == exp_out diff --git a/tests/scrape/test_processor/link_canonical.html b/tests/scrape/test_processor/link_canonical.html new file mode 100644 index 0000000..5408e7f --- /dev/null +++ b/tests/scrape/test_processor/link_canonical.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/link_canonical_empty_href.html b/tests/scrape/test_processor/link_canonical_empty_href.html new file mode 100644 index 0000000..37f9b6b --- /dev/null +++ b/tests/scrape/test_processor/link_canonical_empty_href.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/link_canonical_no_href.html b/tests/scrape/test_processor/link_canonical_no_href.html new file mode 100644 index 0000000..ce095fd --- /dev/null +++ b/tests/scrape/test_processor/link_canonical_no_href.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/no_head.html b/tests/scrape/test_processor/no_head.html new file mode 100644 index 0000000..a7af195 --- /dev/null +++ b/tests/scrape/test_processor/no_head.html @@ -0,0 +1,6 @@ + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/no_self_url.html b/tests/scrape/test_processor/no_self_url.html new file mode 100644 index 0000000..152ee7f --- /dev/null +++ b/tests/scrape/test_processor/no_self_url.html @@ -0,0 +1,9 @@ + + + + Test Document + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/og_url.html b/tests/scrape/test_processor/og_url.html new file mode 100644 index 0000000..b6f0f60 --- /dev/null +++ b/tests/scrape/test_processor/og_url.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/og_url_empty_content.html b/tests/scrape/test_processor/og_url_empty_content.html new file mode 100644 index 0000000..27068b4 --- /dev/null +++ b/tests/scrape/test_processor/og_url_empty_content.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/og_url_no_content.html b/tests/scrape/test_processor/og_url_no_content.html new file mode 100644 index 0000000..a5ae81c --- /dev/null +++ b/tests/scrape/test_processor/og_url_no_content.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/self_url_both.html b/tests/scrape/test_processor/self_url_both.html new file mode 100644 index 0000000..0404d7a --- /dev/null +++ b/tests/scrape/test_processor/self_url_both.html @@ -0,0 +1,11 @@ + + + + Test Document + + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_scrape.py b/tests/scrape/test_scrape.py new file mode 100644 index 0000000..b80e083 --- /dev/null +++ b/tests/scrape/test_scrape.py @@ -0,0 +1,16 @@ +from wpextract.scrape.scrape import load_scrape + + +def test_load_scrape(shared_datadir): + scrape_urls_files = { + "https://example.org/an-example-post/": shared_datadir + / "scrape" + / "an-example-post" + / "index.html", + } + + soup = load_scrape(scrape_urls_files, "https://example.org/an-example-post/") + assert soup.title.string == "An Example Post - example.org" + + not_found = load_scrape(scrape_urls_files, "https://example.org/not-found/") + assert not_found is None
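
---

Note for reviewers: below is a minimal sketch of the crawl and self-URL APIs these tests exercise, assuming a hypothetical `my-site-scrape` directory laid out like the fixtures added under `tests/scrape/data/scrape/` (the path and layout are illustrative, not part of this patch):

```python
from pathlib import Path

from bs4 import BeautifulSoup
from wpextract.scrape.crawler import ScrapeCrawl
from wpextract.scrape.processor import extract_self_url

# Hypothetical directory holding a scraped copy of a site.
scrape_root = Path("my-site-scrape")

crawl = ScrapeCrawl(scrape_root)
crawl.crawl()  # writes url_cache.json; a rerun with the same SCRAPE_CRAWL_VERSION reuses it

print(crawl.found_pages)          # relative file path -> self URL
print(crawl.failed_docs)          # relative paths of pages with no usable self URL
print(crawl.get_link_abs_path())  # self URL -> absolute file path

# The self-URL extraction can also be called directly on a parsed page;
# a canonical <link> takes precedence over og:url, and None is returned
# if neither yields a valid absolute URL.
doc = BeautifulSoup((scrape_root / "an-example-post/index.html").read_text(), "lxml")
print(extract_self_url(doc))
```

Since `found_pages` maps relative paths to URLs, the duplicate check now compares against `.values()`: if two files resolve to the same URL, only the first is kept and the second is logged, which is what `test_duplicate_url` asserts.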