Skip to content

Commit

Permalink
more tests for download/parse/extract
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyheppell committed Aug 8, 2024
1 parent a53b9d9 commit 0610aa7
Show file tree
Hide file tree
Showing 29 changed files with 470 additions and 19 deletions.
7 changes: 7 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## 1.1.0 (upcoming)

**Fixes**

- Fixed the scrape crawling step crashing if a page didn't have a canonical link or `og:url` meta tag
- Fixed the scrape crawling not correctly recognising when duplicate URLs were encountered. Previously duplicates would be included, but only one would be used. Now, they will be correctly logged. As a result of this change, the `SCRAPE_CRAWL_VERSION` has been incremented, meaning running extraction on a scrape will cause it to be re-crawled.

## 1.0.3 (2024-08-06)

**Changes**
Expand Down
13 changes: 6 additions & 7 deletions src/wpextract/scrape/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from wpextract.scrape.processor import extract_self_url, self_url_strainer

# Increment to invalidate old caches
SCRAPE_CRAWL_VERSION = 1
SCRAPE_CRAWL_VERSION = 2


class ScrapeCrawl:
Expand All @@ -28,7 +28,7 @@ class ScrapeCrawl:
"""

found_pages: dict[str, str]
failed_docs: list[Path]
failed_docs: list[str]
crawled = False

def __init__(self, root_path: Path) -> None:
Expand Down Expand Up @@ -95,14 +95,13 @@ def crawl(self) -> None:
doc_url = extract_self_url(doc)

if doc_url is None:
self.failed_docs.append(path)
logging.warning(f'Failed to find self-URL in doc "{path}"')
self.failed_docs.append(relative_path)
logging.warning(f'Failed to find self-URL in doc "{relative_path}"')
continue

if doc_url in self.found_pages:
if doc_url in self.found_pages.values():
logging.info(
f"URL {doc_url} retrieved for {relative_path}, but has already been"
f"found for {self.found_pages[relative_path]}"
f"URL {doc_url} retrieved for {relative_path}, but has already been found"
)
continue

Expand Down
6 changes: 0 additions & 6 deletions tests/download/test_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,6 @@ def _fake_api_return():
return [{"id": idx, "title": "dummy return"} for idx in range(20)]


def _export_method(datatype):
if datatype == "comments":
return "export_comments_interactive"
return f"export_{datatype}"


def _mocked_exporter(mocker, datatype):
cls = "wpextract.download.exporter.Exporter."
if datatype == "comments":
Expand Down
48 changes: 47 additions & 1 deletion tests/extractors/test_io.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import numpy as np
import pandas as pd
import pytest
from helpers.df import ordered_col
from wpextract.extractors.io import (
_remove_nan,
_set_nested_keys,
df_denormalize_to_dict,
export_df,
load_df,
Expand Down Expand Up @@ -30,7 +34,19 @@ def test_load_df(datadir):
assert df["nested1.nested2"].equals(ordered_col(["value1", "value2"]))


def test_df_denormalize(datadir):
def test_load_df_from_path_doesnt_exist(datadir):
    """Loading from a nonexistent path yields None rather than raising."""
    result = load_df(datadir / "notreal.json")
    assert result is None


def test_load_df_empty(datadir):
    """Loading an empty JSON array yields None rather than an empty frame."""
    result = load_df(datadir / "empty.json")
    assert result is None


def test_df_denormalize():
df = pd.DataFrame(
[("a", "b"), ("c", "d")], columns=["one", "two.three"], index=[1, 2]
)
Expand All @@ -51,3 +67,33 @@ def test_export_df(datadir, tmp_path):

out_loaded = load_df(out_path)
assert df.equals(out_loaded)


def test_export_df_none(tmp_path):
    """Exporting a None dataframe still writes a file containing an empty JSON array."""
    destination = tmp_path / "out.json"

    export_df(None, destination)

    assert destination.exists()
    assert destination.read_text() == "[]"


def test_set_nested_keys():
    """_set_nested_keys builds nested dicts and refuses to overwrite a leaf."""
    built = _set_nested_keys({}, ["one", "two", "three"], "value")
    assert built == {"one": {"two": {"three": "value"}}}

    # Descending through an existing leaf value must raise, not clobber it.
    with pytest.raises(ValueError, match="is already set"):
        _set_nested_keys({"one": "two"}, ["one", "two", "three"], "value")


@pytest.mark.parametrize(
    ("inp", "exp_out"),
    [
        # Scalar missing markers become None
        (pd.NA, None),
        (np.nan, None),
        # Missing markers are replaced element-wise inside lists
        ([1, pd.NA, 2, np.nan], [1, None, 2, None]),
        # ...and value-wise inside dicts, keys untouched
        ({"a": "foo", "b": pd.NA, "c": np.nan}, {"a": "foo", "b": None, "c": None}),
    ],
)
def test_remove_nan(inp, exp_out):
    """_remove_nan replaces pandas/numpy missing-value markers with None."""
    assert _remove_nan(inp) == exp_out
1 change: 1 addition & 0 deletions tests/extractors/test_io/empty.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[]
13 changes: 12 additions & 1 deletion tests/parse/test_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ def test_extract_links(datadir: Path):
assert external == [Link(text="An external link", href="https://gate.ac.uk")]


def test_extract_links_no_href():
    """An anchor with no href attribute is classified as external with href=None."""
    doc = BeautifulSoup("<a>No href</a>", "lxml")

    internal_links, external_links = extract_links(doc, "https://example.org/home")

    assert internal_links == []
    assert external_links == [Link(text="No href", href=None)]


def test_extract_embeds(datadir: Path):
doc = BeautifulSoup((datadir / "embeds.html").read_text(), "lxml")

Expand Down Expand Up @@ -70,7 +81,7 @@ def test_extract_images(datadir: Path):
]


def test_extract_image_without_src(datadir: Path):
def test_extract_image_without_src():
doc = BeautifulSoup("<img alt='No src'>", "lxml")

images = extract_images(doc, "https://example.org/home")
Expand Down
50 changes: 50 additions & 0 deletions tests/parse/translations/test_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import logging

import pytest
from bs4 import BeautifulSoup
from wpextract.parse.translations import extract_translations
from wpextract.parse.translations._pickers import PolylangCustomDropdown, PolylangWidget


class FaultyDummyPicker(PolylangWidget):
    """Picker whose extraction always fails; exercises the error-handling path."""

    def extract(self) -> None:
        # Always raise the picker's extraction-failure error for a dummy selector.
        raise self._build_extraction_fail_err(".dummy")


@pytest.fixture()
def parsed_page(shared_datadir):
    """Parse the shared Polylang widget fixture page into a soup."""
    html = (shared_datadir / "polylang_widget.html").read_text()
    return BeautifulSoup(html, "lxml")


def test_extract_translations(parsed_page):
    """With default pickers, the page's language and one translation are found."""
    result = extract_translations(
        parsed_page, "https://example.org/current-lang-page/", translation_pickers=None
    )

    current_language = result.iloc[0]
    translation_links = result.iloc[1]
    assert str(current_language) == "en-US"
    assert len(translation_links) == 1


def test_none_matching(caplog, parsed_page):
    """When no picker matches the page, nothing is extracted and it is logged."""
    with caplog.at_level(logging.DEBUG):
        result = extract_translations(
            parsed_page,
            "https://example.org/current-lang-page/",
            translation_pickers=[PolylangCustomDropdown],
        )
        # No language detected and an empty translation list returned.
        assert result.iloc[0] is None
        assert result.iloc[1] == []

    assert "No translation pickers matched" in caplog.text


def test_error_extracting(caplog, parsed_page):
    """A picker that matches but fails to extract logs the failure and yields None."""
    result = extract_translations(
        parsed_page,
        "https://example.org/current-lang-page/",
        translation_pickers=[FaultyDummyPicker],
    )

    assert result.iloc[0] is None
    # The faulty picker raises with its ".dummy" selector, which is logged.
    assert "but failed to select element with: .dummy" in caplog.text
12 changes: 8 additions & 4 deletions tests/parse/translations/test_pickers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
(pickers.PolylangCustomDropdown, "polylang_custom_dropdown.html"),
],
)
def test_picker(datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str):
doc = BeautifulSoup((datadir / picker_file).read_text(), "lxml")
def test_picker(
shared_datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str
):
doc = BeautifulSoup((shared_datadir / picker_file).read_text(), "lxml")

picker = picker_cls(doc)

Expand Down Expand Up @@ -48,8 +50,10 @@ def extract(self) -> None:
@pytest.mark.parametrize(
"picker_cls", [FaultyExtractPickerSelect, FaultyExtractPickerSelectOne]
)
def test_picker_extract_error(datadir: Path, picker_cls: type[pickers.LangPicker]):
doc = BeautifulSoup((datadir / "polylang_widget.html").read_text(), "lxml")
def test_picker_extract_error(
shared_datadir: Path, picker_cls: type[pickers.LangPicker]
):
doc = BeautifulSoup((shared_datadir / "polylang_widget.html").read_text(), "lxml")

picker = picker_cls(doc)
assert picker.matches()
Expand Down
23 changes: 23 additions & 0 deletions tests/parse/translations/test_resolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import Union

import pytest
from langcodes import Language
from wpextract.parse.translations import TranslationLink


@pytest.mark.parametrize(
    ("input_lang", "expected_language"),
    [
        ("en", "en"),
        ("en-GB", "en-GB"),
        ("fr-FR", "fr-FR"),
        (Language.get("en-GB"), "en-GB"),
        ("zho", "zh"),
    ],
)
def test_translation_link_lang(
    input_lang: Union[str, Language], expected_language: str
):
    """The lang argument is normalised to a Language, whether str or Language."""
    translation = TranslationLink(
        text=None, href=None, destination=None, lang=input_lang
    )

    assert str(translation.language) == expected_language
1 change: 1 addition & 0 deletions tests/scrape/data/diff_version_url_cache.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"found": {}, "failed": [], "version": 1}
1 change: 1 addition & 0 deletions tests/scrape/data/expected_url_cache.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"found": {"fr/an-example-post-translation/index.html": "https://example.org/fr/an-example-post-translation/", "an-example-post/index.html": "https://example.org/an-example-post/", "an-unrelated-post/index.html": "https://example.org/an-unrelated-post/"}, "failed": ["no-self-url.html"], "version": 2}
19 changes: 19 additions & 0 deletions tests/scrape/data/scrape/an-example-post/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta
name="viewport"
content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0"
/>
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<title>An Example Post - example.org</title>
<link rel="canonical" href="https://example.org/an-example-post/" />
<meta name="post_id_for_mock" content="1" />
</head>
<body>
<p>This is an example post</p>
<p>It has two paragraphs.</p>
<img src="https://example.org/wp-content/uploads/2022/12/test-image.jpg" alt="an image">
</body>
</html>
18 changes: 18 additions & 0 deletions tests/scrape/data/scrape/an-unrelated-post/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta
name="viewport"
content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0"
/>
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<title>An Unrelated Post - example.org</title>
<link rel="canonical" href="https://example.org/an-unrelated-post/" />
<meta name="post_id_for_mock" content="3" />
</head>
<body>
<p>This is an unrelated post.</p>
<p>It has two paragraphs.</p>
</body>
</html>
21 changes: 21 additions & 0 deletions tests/scrape/data/scrape/fr/an-example-post-translation/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta
name="viewport"
content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0"
/>
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<title>An Example Post Translation - example.org</title>
<link
rel="canonical"
href="https://example.org/fr/an-example-post-translation/"
/>
<meta name="post_id_for_mock" content="2" />
</head>
<body>
<p>This is a translation of the post.</p>
<p>It has two paragraphs.</p>
</body>
</html>
9 changes: 9 additions & 0 deletions tests/scrape/data/scrape/no-self-url.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<!doctype html>
<html>
<head>
<title>Test Document</title>
</head>
<body>
<h1>test document</h1>
</body>
</html>
Loading

0 comments on commit 0610aa7

Please sign in to comment.