Skip to content

Commit

Permalink
more tests for download/parse/extract
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyheppell committed Aug 8, 2024
1 parent a53b9d9 commit 0610aa7
Show file tree
Hide file tree
Showing 29 changed files with 470 additions and 19 deletions.
7 changes: 7 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## 1.1.0 (upcoming)

**Fixes**

- Fixed the scrape crawling step crashing if a page didn't have a canonical link or `og:url` meta tag
- Fixed the scrape crawling not correctly recognising when duplicate URLs were encountered. Previously duplicates would be included, but only one would be used. Now, they will be correctly logged. As a result of this change, the `SCRAPE_CRAWL_VERSION` has been incremented, meaning running extraction on a scrape will cause it to be re-crawled.

## 1.0.3 (2024-08-06)

**Changes**
Expand Down
13 changes: 6 additions & 7 deletions src/wpextract/scrape/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from wpextract.scrape.processor import extract_self_url, self_url_strainer

# Increment to invalidate old caches
SCRAPE_CRAWL_VERSION = 1
SCRAPE_CRAWL_VERSION = 2


class ScrapeCrawl:
Expand All @@ -28,7 +28,7 @@ class ScrapeCrawl:
"""

found_pages: dict[str, str]
failed_docs: list[Path]
failed_docs: list[str]
crawled = False

def __init__(self, root_path: Path) -> None:
Expand Down Expand Up @@ -95,14 +95,13 @@ def crawl(self) -> None:
doc_url = extract_self_url(doc)

if doc_url is None:
self.failed_docs.append(path)
logging.warning(f'Failed to find self-URL in doc "{path}"')
self.failed_docs.append(relative_path)
logging.warning(f'Failed to find self-URL in doc "{relative_path}"')
continue

if doc_url in self.found_pages:
if doc_url in self.found_pages.values():
logging.info(
f"URL {doc_url} retrieved for {relative_path}, but has already been"
f"found for {self.found_pages[relative_path]}"
f"URL {doc_url} retrieved for {relative_path}, but has already been found"
)
continue

Expand Down
6 changes: 0 additions & 6 deletions tests/download/test_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,6 @@ def _fake_api_return():
return [{"id": idx, "title": "dummy return"} for idx in range(20)]


def _export_method(datatype):
if datatype == "comments":
return "export_comments_interactive"
return f"export_{datatype}"


def _mocked_exporter(mocker, datatype):
cls = "wpextract.download.exporter.Exporter."
if datatype == "comments":
Expand Down
48 changes: 47 additions & 1 deletion tests/extractors/test_io.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import numpy as np
import pandas as pd
import pytest
from helpers.df import ordered_col
from wpextract.extractors.io import (
_remove_nan,
_set_nested_keys,
df_denormalize_to_dict,
export_df,
load_df,
Expand Down Expand Up @@ -30,7 +34,19 @@ def test_load_df(datadir):
assert df["nested1.nested2"].equals(ordered_col(["value1", "value2"]))


def test_df_denormalize(datadir):
def test_load_df_from_path_doesnt_exist(datadir):
    """Loading from a nonexistent path yields None rather than raising."""
    result = load_df(datadir / "notreal.json")
    assert result is None


def test_load_df_empty(datadir):
    """Loading an empty JSON array yields None rather than an empty frame."""
    result = load_df(datadir / "empty.json")
    assert result is None


def test_df_denormalize():
df = pd.DataFrame(
[("a", "b"), ("c", "d")], columns=["one", "two.three"], index=[1, 2]
)
Expand All @@ -51,3 +67,33 @@ def test_export_df(datadir, tmp_path):

out_loaded = load_df(out_path)
assert df.equals(out_loaded)


def test_export_df_none(tmp_path):
    """Exporting a None dataframe still writes a file containing an empty JSON array."""
    destination = tmp_path / "out.json"

    export_df(None, destination)

    assert destination.exists()
    assert destination.read_text() == "[]"


def test_set_nested_keys():
    """_set_nested_keys builds nested dicts and refuses to overwrite a leaf."""
    built = _set_nested_keys({}, ["one", "two", "three"], "value")
    assert built == {"one": {"two": {"three": "value"}}}

    # Descending through an existing leaf value must raise, not clobber it.
    with pytest.raises(ValueError, match="is already set"):
        _set_nested_keys({"one": "two"}, ["one", "two", "three"], "value")


@pytest.mark.parametrize(
    ("inp", "exp_out"),
    [
        # Scalar missing markers become None
        (pd.NA, None),
        (np.nan, None),
        # Missing markers are replaced element-wise inside lists
        ([1, pd.NA, 2, np.nan], [1, None, 2, None]),
        # ...and value-wise inside dicts, keys untouched
        ({"a": "foo", "b": pd.NA, "c": np.nan}, {"a": "foo", "b": None, "c": None}),
    ],
)
def test_remove_nan(inp, exp_out):
    """_remove_nan replaces pandas/numpy missing-value markers with None."""
    assert _remove_nan(inp) == exp_out
1 change: 1 addition & 0 deletions tests/extractors/test_io/empty.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[]
13 changes: 12 additions & 1 deletion tests/parse/test_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ def test_extract_links(datadir: Path):
assert external == [Link(text="An external link", href="https://gate.ac.uk")]


def test_extract_links_no_href():
    """An anchor with no href attribute is classified as external with href=None."""
    doc = BeautifulSoup("<a>No href</a>", "lxml")

    internal_links, external_links = extract_links(doc, "https://example.org/home")

    assert internal_links == []
    assert external_links == [Link(text="No href", href=None)]


def test_extract_embeds(datadir: Path):
doc = BeautifulSoup((datadir / "embeds.html").read_text(), "lxml")

Expand Down Expand Up @@ -70,7 +81,7 @@ def test_extract_images(datadir: Path):
]


def test_extract_image_without_src(datadir: Path):
def test_extract_image_without_src():
doc = BeautifulSoup("<img alt='No src'>", "lxml")

images = extract_images(doc, "https://example.org/home")
Expand Down
50 changes: 50 additions & 0 deletions tests/parse/translations/test_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import logging

import pytest
from bs4 import BeautifulSoup
from wpextract.parse.translations import extract_translations
from wpextract.parse.translations._pickers import PolylangCustomDropdown, PolylangWidget


class FaultyDummyPicker(PolylangWidget):
    """Picker whose extraction always fails; exercises the error-handling path."""

    def extract(self) -> None:
        # Always raise the picker's extraction-failure error for a dummy selector.
        raise self._build_extraction_fail_err(".dummy")


@pytest.fixture()
def parsed_page(shared_datadir):
    """Parse the shared Polylang widget fixture page into a soup."""
    html = (shared_datadir / "polylang_widget.html").read_text()
    return BeautifulSoup(html, "lxml")


def test_extract_translations(parsed_page):
    """With default pickers, the page's language and one translation are found."""
    result = extract_translations(
        parsed_page, "https://example.org/current-lang-page/", translation_pickers=None
    )

    current_language = result.iloc[0]
    translation_links = result.iloc[1]
    assert str(current_language) == "en-US"
    assert len(translation_links) == 1


def test_none_matching(caplog, parsed_page):
    """When no picker matches the page, nothing is extracted and it is logged."""
    with caplog.at_level(logging.DEBUG):
        result = extract_translations(
            parsed_page,
            "https://example.org/current-lang-page/",
            translation_pickers=[PolylangCustomDropdown],
        )
        # No language detected and an empty translation list returned.
        assert result.iloc[0] is None
        assert result.iloc[1] == []

    assert "No translation pickers matched" in caplog.text


def test_error_extracting(caplog, parsed_page):
    """A picker that matches but fails to extract logs the failure and yields None."""
    result = extract_translations(
        parsed_page,
        "https://example.org/current-lang-page/",
        translation_pickers=[FaultyDummyPicker],
    )

    assert result.iloc[0] is None
    # The faulty picker raises with its ".dummy" selector, which is logged.
    assert "but failed to select element with: .dummy" in caplog.text
12 changes: 8 additions & 4 deletions tests/parse/translations/test_pickers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
(pickers.PolylangCustomDropdown, "polylang_custom_dropdown.html"),
],
)
def test_picker(datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str):
doc = BeautifulSoup((datadir / picker_file).read_text(), "lxml")
def test_picker(
shared_datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str
):
doc = BeautifulSoup((shared_datadir / picker_file).read_text(), "lxml")

picker = picker_cls(doc)

Expand Down Expand Up @@ -48,8 +50,10 @@ def extract(self) -> None:
@pytest.mark.parametrize(
"picker_cls", [FaultyExtractPickerSelect, FaultyExtractPickerSelectOne]
)
def test_picker_extract_error(datadir: Path, picker_cls: type[pickers.LangPicker]):
doc = BeautifulSoup((datadir / "polylang_widget.html").read_text(), "lxml")
def test_picker_extract_error(
shared_datadir: Path, picker_cls: type[pickers.LangPicker]
):
doc = BeautifulSoup((shared_datadir / "polylang_widget.html").read_text(), "lxml")

picker = picker_cls(doc)
assert picker.matches()
Expand Down
23 changes: 23 additions & 0 deletions tests/parse/translations/test_resolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import Union

import pytest
from langcodes import Language
from wpextract.parse.translations import TranslationLink


@pytest.mark.parametrize(
    ("input_lang", "expected_language"),
    [
        ("en", "en"),
        ("en-GB", "en-GB"),
        ("fr-FR", "fr-FR"),
        (Language.get("en-GB"), "en-GB"),
        ("zho", "zh"),
    ],
)
def test_translation_link_lang(
    input_lang: Union[str, Language], expected_language: str
):
    """The lang argument is normalised to a Language, whether str or Language."""
    translation = TranslationLink(
        text=None, href=None, destination=None, lang=input_lang
    )

    assert str(translation.language) == expected_language
1 change: 1 addition & 0 deletions tests/scrape/data/diff_version_url_cache.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"found": {}, "failed": [], "version": 1}
1 change: 1 addition & 0 deletions tests/scrape/data/expected_url_cache.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"found": {"fr/an-example-post-translation/index.html": "https://example.org/fr/an-example-post-translation/", "an-example-post/index.html": "https://example.org/an-example-post/", "an-unrelated-post/index.html": "https://example.org/an-unrelated-post/"}, "failed": ["no-self-url.html"], "version": 2}
19 changes: 19 additions & 0 deletions tests/scrape/data/scrape/an-example-post/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta
name="viewport"
content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0"
/>
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<title>An Example Post - example.org</title>
<link rel="canonical" href="https://example.org/an-example-post/" />
<meta name="post_id_for_mock" content="1" />
</head>
<body>
<p>This is an example post</p>
<p>It has two paragraphs.</p>
<img src="https://example.org/wp-content/uploads/2022/12/test-image.jpg" alt="an image">
</body>
</html>
18 changes: 18 additions & 0 deletions tests/scrape/data/scrape/an-unrelated-post/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta
name="viewport"
content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0"
/>
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<title>An Unrelated Post - example.org</title>
<link rel="canonical" href="https://example.org/an-unrelated-post/" />
<meta name="post_id_for_mock" content="3" />
</head>
<body>
<p>This is an unrelated post.</p>
<p>It has two paragraphs.</p>
</body>
</html>
21 changes: 21 additions & 0 deletions tests/scrape/data/scrape/fr/an-example-post-translation/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta
name="viewport"
content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0"
/>
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<title>An Example Post Translation - example.org</title>
<link
rel="canonical"
href="https://example.org/fr/an-example-post-translation/"
/>
<meta name="post_id_for_mock" content="2" />
</head>
<body>
<p>This is a translation of the post.</p>
<p>It has two paragraphs.</p>
</body>
</html>
9 changes: 9 additions & 0 deletions tests/scrape/data/scrape/no-self-url.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<!doctype html>
<html>
<head>
<title>Test Document</title>
</head>
<body>
<h1>test document</h1>
</body>
</html>
Loading

0 comments on commit 0610aa7

Please sign in to comment.