From 0610aa78f935c717385f75aeaa8318a7ed36a29b Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Thu, 8 Aug 2024 17:42:03 +0100 Subject: [PATCH] more tests for download/parse/extract --- docs/changelog.md | 7 ++ src/wpextract/scrape/crawler.py | 13 ++- tests/download/test_downloader.py | 6 -- tests/extractors/test_io.py | 48 ++++++++++- tests/extractors/test_io/empty.json | 1 + tests/parse/test_content.py | 13 ++- .../polylang_custom_dropdown.html | 0 .../polylang_widget.html | 0 tests/parse/translations/test_extractor.py | 50 ++++++++++++ tests/parse/translations/test_pickers.py | 12 ++- tests/parse/translations/test_resolver.py | 23 ++++++ tests/scrape/data/diff_version_url_cache.json | 1 + tests/scrape/data/expected_url_cache.json | 1 + .../data/scrape/an-example-post/index.html | 19 +++++ .../data/scrape/an-unrelated-post/index.html | 18 +++++ .../fr/an-example-post-translation/index.html | 21 +++++ tests/scrape/data/scrape/no-self-url.html | 9 +++ tests/scrape/test_crawler.py | 80 +++++++++++++++++++ tests/scrape/test_processor.py | 65 +++++++++++++++ .../scrape/test_processor/link_canonical.html | 10 +++ .../link_canonical_empty_href.html | 10 +++ .../link_canonical_no_href.html | 10 +++ tests/scrape/test_processor/no_head.html | 6 ++ tests/scrape/test_processor/no_self_url.html | 9 +++ tests/scrape/test_processor/og_url.html | 10 +++ .../test_processor/og_url_empty_content.html | 10 +++ .../test_processor/og_url_no_content.html | 10 +++ .../scrape/test_processor/self_url_both.html | 11 +++ tests/scrape/test_scrape.py | 16 ++++ 29 files changed, 470 insertions(+), 19 deletions(-) create mode 100644 tests/extractors/test_io/empty.json rename tests/parse/translations/{test_pickers => data}/polylang_custom_dropdown.html (100%) rename tests/parse/translations/{test_pickers => data}/polylang_widget.html (100%) create mode 100644 tests/parse/translations/test_extractor.py create mode 100644 tests/parse/translations/test_resolver.py create mode 100644 tests/scrape/data/diff_version_url_cache.json create mode 100644 tests/scrape/data/expected_url_cache.json create mode 100644 tests/scrape/data/scrape/an-example-post/index.html create mode 100644 tests/scrape/data/scrape/an-unrelated-post/index.html create mode 100644 tests/scrape/data/scrape/fr/an-example-post-translation/index.html create mode 100644 tests/scrape/data/scrape/no-self-url.html create mode 100644 tests/scrape/test_crawler.py create mode 100644 tests/scrape/test_processor.py create mode 100644 tests/scrape/test_processor/link_canonical.html create mode 100644 tests/scrape/test_processor/link_canonical_empty_href.html create mode 100644 tests/scrape/test_processor/link_canonical_no_href.html create mode 100644 tests/scrape/test_processor/no_head.html create mode 100644 tests/scrape/test_processor/no_self_url.html create mode 100644 tests/scrape/test_processor/og_url.html create mode 100644 tests/scrape/test_processor/og_url_empty_content.html create mode 100644 tests/scrape/test_processor/og_url_no_content.html create mode 100644 tests/scrape/test_processor/self_url_both.html create mode 100644 tests/scrape/test_scrape.py diff --git a/docs/changelog.md b/docs/changelog.md index 5c03422..936ef78 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,12 @@ # Changelog +## 1.1.0 (upcoming) + +**Fixes** + +- Fixed the scrape crawling step crashing if a page didn't have a canonical link or `og:url` meta tag +- Fixed the scrape crawling not correctly recognising when duplicate URLs were encountered. 
Previously, duplicate URLs were not detected: every copy was kept in the crawl results, but only one was ultimately used. Duplicates are now logged and skipped. As a result of this change, `SCRAPE_CRAWL_VERSION` has been incremented, so running an extraction on an existing scrape will cause it to be re-crawled.
+
 ## 1.0.3 (2024-08-06)
 
 **Changes**
diff --git a/src/wpextract/scrape/crawler.py b/src/wpextract/scrape/crawler.py
index 3450c3a..cf3a547 100644
--- a/src/wpextract/scrape/crawler.py
+++ b/src/wpextract/scrape/crawler.py
@@ -8,7 +8,7 @@ from wpextract.scrape.processor import extract_self_url, self_url_strainer
 
 # Increment to invalidate old caches
-SCRAPE_CRAWL_VERSION = 1
+SCRAPE_CRAWL_VERSION = 2
 
 
 class ScrapeCrawl:
@@ -28,7 +28,7 @@ class ScrapeCrawl:
     """
 
     found_pages: dict[str, str]
-    failed_docs: list[Path]
+    failed_docs: list[str]
     crawled = False
 
     def __init__(self, root_path: Path) -> None:
@@ -95,14 +95,13 @@ def crawl(self) -> None:
             doc_url = extract_self_url(doc)
 
             if doc_url is None:
-                self.failed_docs.append(path)
-                logging.warning(f'Failed to find self-URL in doc "{path}"')
+                self.failed_docs.append(relative_path)
+                logging.warning(f'Failed to find self-URL in doc "{relative_path}"')
                 continue
 
-            if doc_url in self.found_pages:
+            if doc_url in self.found_pages.values():
                 logging.info(
-                    f"URL {doc_url} retrieved for {relative_path}, but has already been"
-                    f"found for {self.found_pages[relative_path]}"
+                    f"URL {doc_url} retrieved for {relative_path}, but has already been found"
                 )
                 continue
 
diff --git a/tests/download/test_downloader.py b/tests/download/test_downloader.py
index 79f1a3d..b2b493c 100644
--- a/tests/download/test_downloader.py
+++ b/tests/download/test_downloader.py
@@ -25,12 +25,6 @@ def _fake_api_return():
     return [{"id": idx, "title": "dummy return"} for idx in range(20)]
 
 
-def _export_method(datatype):
-    if datatype == "comments":
-        return "export_comments_interactive"
-    return f"export_{datatype}"
-
-
 def _mocked_exporter(mocker, datatype):
     cls = "wpextract.download.exporter.Exporter."
     if datatype == "comments":
diff --git a/tests/extractors/test_io.py b/tests/extractors/test_io.py
index 68d7e6a..d2c78e4 100644
--- a/tests/extractors/test_io.py
+++ b/tests/extractors/test_io.py
@@ -1,6 +1,10 @@
+import numpy as np
 import pandas as pd
+import pytest
 from helpers.df import ordered_col
 from wpextract.extractors.io import (
+    _remove_nan,
+    _set_nested_keys,
     df_denormalize_to_dict,
     export_df,
     load_df,
@@ -30,7 +34,19 @@ def test_load_df(datadir):
     assert df["nested1.nested2"].equals(ordered_col(["value1", "value2"]))
 
 
-def test_df_denormalize(datadir):
+def test_load_df_from_path_doesnt_exist(datadir):
+    df = load_df(datadir / "notreal.json")
+
+    assert df is None
+
+
+def test_load_df_empty(datadir):
+    df = load_df(datadir / "empty.json")
+
+    assert df is None
+
+
+def test_df_denormalize():
     df = pd.DataFrame(
         [("a", "b"), ("c", "d")], columns=["one", "two.three"], index=[1, 2]
     )
@@ -51,3 +67,33 @@ def test_export_df(datadir, tmp_path):
     out_loaded = load_df(out_path)
 
     assert df.equals(out_loaded)
+
+
+def test_export_df_none(tmp_path):
+    out_path = tmp_path / "out.json"
+    export_df(None, out_path)
+
+    assert out_path.exists()
+
+    assert out_path.read_text() == "[]"
+
+
+def test_set_nested_keys():
+    res = _set_nested_keys({}, ["one", "two", "three"], "value")
+    assert res == {"one": {"two": {"three": "value"}}}
+
+    with pytest.raises(ValueError, match="is already set"):
+        _set_nested_keys({"one": "two"}, ["one", "two", "three"], "value")
+
+
+@pytest.mark.parametrize(
+    ("inp", "exp_out"),
+    [
+        (pd.NA, None),
+        (np.nan, None),
+        ([1, pd.NA, 2, np.nan], [1, None, 2, None]),
+        ({"a": "foo", "b": pd.NA, "c": np.nan}, {"a": "foo", "b": None, "c": None}),
+    ],
+)
+def test_remove_nan(inp, exp_out):
+    assert _remove_nan(inp) == exp_out
diff --git a/tests/extractors/test_io/empty.json b/tests/extractors/test_io/empty.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/tests/extractors/test_io/empty.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/tests/parse/test_content.py b/tests/parse/test_content.py
index 2e4d370..5e1e19a 100644
--- a/tests/parse/test_content.py
+++ b/tests/parse/test_content.py
@@ -35,6 +35,17 @@ def test_extract_links(datadir: Path):
     assert external == [Link(text="An external link", href="https://gate.ac.uk")]
 
 
+def test_extract_links_no_href():
+    doc = BeautifulSoup("<a>No href</a>", "lxml")
+
+    internal, external = extract_links(doc, "https://example.org/home")
+
+    assert internal == []
+    assert external == [
+        Link(text="No href", href=None),
+    ]
+
+
 def test_extract_embeds(datadir: Path):
     doc = BeautifulSoup((datadir / "embeds.html").read_text(), "lxml")
 
@@ -70,7 +81,7 @@ def test_extract_images(datadir: Path):
     ]
 
 
-def test_extract_image_without_src(datadir: Path):
+def test_extract_image_without_src():
     doc = BeautifulSoup("<img>No src", "lxml")
 
     images = extract_images(doc, "https://example.org/home")
diff --git a/tests/parse/translations/test_pickers/polylang_custom_dropdown.html b/tests/parse/translations/data/polylang_custom_dropdown.html
similarity index 100%
rename from tests/parse/translations/test_pickers/polylang_custom_dropdown.html
rename to tests/parse/translations/data/polylang_custom_dropdown.html
diff --git a/tests/parse/translations/test_pickers/polylang_widget.html b/tests/parse/translations/data/polylang_widget.html
similarity index 100%
rename from tests/parse/translations/test_pickers/polylang_widget.html
rename to tests/parse/translations/data/polylang_widget.html
diff --git a/tests/parse/translations/test_extractor.py
b/tests/parse/translations/test_extractor.py
new file mode 100644
index 0000000..f3d1480
--- /dev/null
+++ b/tests/parse/translations/test_extractor.py
@@ -0,0 +1,50 @@
+import logging
+
+import pytest
+from bs4 import BeautifulSoup
+from wpextract.parse.translations import extract_translations
+from wpextract.parse.translations._pickers import PolylangCustomDropdown, PolylangWidget
+
+
+class FaultyDummyPicker(PolylangWidget):
+    def extract(self) -> None:
+        raise self._build_extraction_fail_err(".dummy")
+
+
+@pytest.fixture()
+def parsed_page(shared_datadir):
+    soup = BeautifulSoup((shared_datadir / "polylang_widget.html").read_text(), "lxml")
+    return soup
+
+
+def test_extract_translations(parsed_page):
+    res = extract_translations(
+        parsed_page, "https://example.org/current-lang-page/", translation_pickers=None
+    )
+
+    assert str(res.iloc[0]) == "en-US"
+    assert len(res.iloc[1]) == 1
+
+
+def test_none_matching(caplog, parsed_page):
+    with caplog.at_level(logging.DEBUG):
+        res = extract_translations(
+            parsed_page,
+            "https://example.org/current-lang-page/",
+            translation_pickers=[PolylangCustomDropdown],
+        )
+    assert res.iloc[0] is None
+    assert res.iloc[1] == []
+
+    assert "No translation pickers matched" in caplog.text
+
+
+def test_error_extracting(caplog, parsed_page):
+    res = extract_translations(
+        parsed_page,
+        "https://example.org/current-lang-page/",
+        translation_pickers=[FaultyDummyPicker],
+    )
+
+    assert res.iloc[0] is None
+    assert "but failed to select element with: .dummy" in caplog.text
diff --git a/tests/parse/translations/test_pickers.py b/tests/parse/translations/test_pickers.py
index 403b714..c611c6d 100644
--- a/tests/parse/translations/test_pickers.py
+++ b/tests/parse/translations/test_pickers.py
@@ -14,8 +14,10 @@
         (pickers.PolylangCustomDropdown, "polylang_custom_dropdown.html"),
     ],
 )
-def test_picker(datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str):
-    doc = BeautifulSoup((datadir / picker_file).read_text(), "lxml")
+def test_picker(
+    shared_datadir: Path, picker_cls: type[pickers.LangPicker], picker_file: str
+):
+    doc = BeautifulSoup((shared_datadir / picker_file).read_text(), "lxml")
 
     picker = picker_cls(doc)
 
@@ -48,8 +50,10 @@ def extract(self) -> None:
 @pytest.mark.parametrize(
     "picker_cls", [FaultyExtractPickerSelect, FaultyExtractPickerSelectOne]
 )
-def test_picker_extract_error(datadir: Path, picker_cls: type[pickers.LangPicker]):
-    doc = BeautifulSoup((datadir / "polylang_widget.html").read_text(), "lxml")
+def test_picker_extract_error(
+    shared_datadir: Path, picker_cls: type[pickers.LangPicker]
+):
+    doc = BeautifulSoup((shared_datadir / "polylang_widget.html").read_text(), "lxml")
 
     picker = picker_cls(doc)
     assert picker.matches()
diff --git a/tests/parse/translations/test_resolver.py b/tests/parse/translations/test_resolver.py
new file mode 100644
index 0000000..7a81c5c
--- /dev/null
+++ b/tests/parse/translations/test_resolver.py
@@ -0,0 +1,23 @@
+from typing import Union
+
+import pytest
+from langcodes import Language
+from wpextract.parse.translations import TranslationLink
+
+
+@pytest.mark.parametrize(
+    ("input_lang", "expected_language"),
+    [
+        ("en", "en"),
+        ("en-GB", "en-GB"),
+        ("fr-FR", "fr-FR"),
+        (Language.get("en-GB"), "en-GB"),
+        ("zho", "zh"),
+    ],
+)
+def test_translation_link_lang(
+    input_lang: Union[str, Language], expected_language: str
+):
+    link = TranslationLink(text=None, href=None, destination=None, lang=input_lang)
+
+    assert str(link.language) == expected_language
diff --git
a/tests/scrape/data/diff_version_url_cache.json b/tests/scrape/data/diff_version_url_cache.json new file mode 100644 index 0000000..405e1e6 --- /dev/null +++ b/tests/scrape/data/diff_version_url_cache.json @@ -0,0 +1 @@ +{"found": {}, "failed": [], "version": 1} \ No newline at end of file diff --git a/tests/scrape/data/expected_url_cache.json b/tests/scrape/data/expected_url_cache.json new file mode 100644 index 0000000..4539f30 --- /dev/null +++ b/tests/scrape/data/expected_url_cache.json @@ -0,0 +1 @@ +{"found": {"fr/an-example-post-translation/index.html": "https://example.org/fr/an-example-post-translation/", "an-example-post/index.html": "https://example.org/an-example-post/", "an-unrelated-post/index.html": "https://example.org/an-unrelated-post/"}, "failed": ["no-self-url.html"], "version": 2} \ No newline at end of file diff --git a/tests/scrape/data/scrape/an-example-post/index.html b/tests/scrape/data/scrape/an-example-post/index.html new file mode 100644 index 0000000..c641d63 --- /dev/null +++ b/tests/scrape/data/scrape/an-example-post/index.html @@ -0,0 +1,19 @@ + + + + + + + An Example Post - example.org + + + + +

This is an example post

+

It has two paragraphs.

+ an image + + diff --git a/tests/scrape/data/scrape/an-unrelated-post/index.html b/tests/scrape/data/scrape/an-unrelated-post/index.html new file mode 100644 index 0000000..f55ffa8 --- /dev/null +++ b/tests/scrape/data/scrape/an-unrelated-post/index.html @@ -0,0 +1,18 @@ + + + + + + + An Unrelated Post - example.org + + + + +

This is an unrelated post.

+

It has two paragraphs.

+ + diff --git a/tests/scrape/data/scrape/fr/an-example-post-translation/index.html b/tests/scrape/data/scrape/fr/an-example-post-translation/index.html new file mode 100644 index 0000000..2a1dc88 --- /dev/null +++ b/tests/scrape/data/scrape/fr/an-example-post-translation/index.html @@ -0,0 +1,21 @@ + + + + + + + An Example Post Translation - example.org + + + + +

This is a translation of the post.

+

It has two paragraphs.

+ + diff --git a/tests/scrape/data/scrape/no-self-url.html b/tests/scrape/data/scrape/no-self-url.html new file mode 100644 index 0000000..152ee7f --- /dev/null +++ b/tests/scrape/data/scrape/no-self-url.html @@ -0,0 +1,9 @@ + + + + Test Document + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_crawler.py b/tests/scrape/test_crawler.py new file mode 100644 index 0000000..8dc06af --- /dev/null +++ b/tests/scrape/test_crawler.py @@ -0,0 +1,80 @@ +import logging +from pathlib import Path + +import pytest +from wpextract.scrape.crawler import ScrapeCrawl + +EXPECTED_PAGES = { + "an-example-post/index.html": "https://example.org/an-example-post/", + "an-unrelated-post/index.html": "https://example.org/an-unrelated-post/", + "fr/an-example-post-translation/index.html": "https://example.org/fr/an-example-post-translation/", +} + + +def expected_link_abs_path(root_dir: Path): + root_path = str(root_dir.resolve()) + return {link: Path(root_path + "/" + path) for path, link in EXPECTED_PAGES.items()} + + +def test_scrape_crawl(shared_datadir): + root_path = shared_datadir / "scrape" + + crawl = ScrapeCrawl(root_path) + + crawl.crawl() + + assert crawl.found_pages == EXPECTED_PAGES + + assert crawl.failed_docs == ["no-self-url.html"] + + assert crawl.get_link_abs_path() == expected_link_abs_path(root_path) + + assert (root_path / "url_cache.json").is_file() + assert (root_path / "url_cache.json").read_text() == ( + shared_datadir / "expected_url_cache.json" + ).read_text() + + +def test_recrawl_different_version(caplog, shared_datadir: Path): + root_path = shared_datadir / "scrape" + (shared_datadir / "diff_version_url_cache.json").rename( + root_path / "url_cache.json" + ) + + with caplog.at_level(logging.INFO): + crawl = ScrapeCrawl(root_path) + crawl.crawl() + + assert crawl.found_pages != {} + assert "out of date" in caplog.text + + +def test_scrape_crawl_cached(shared_datadir: Path): + root_path = shared_datadir / "scrape" + crawl = ScrapeCrawl(root_path) + crawl.crawl() + + (root_path / "an-example-post/index.html").unlink() + + crawl2 = ScrapeCrawl(root_path) + crawl2.crawl() + assert crawl2.found_pages == EXPECTED_PAGES + + +def test_duplicate_url(caplog, shared_datadir: Path): + root_path = shared_datadir / "scrape" + (root_path / "dupe.html").write_text( + (root_path / "an-example-post/index.html").read_text() + ) + + with caplog.at_level(logging.INFO): + crawl = ScrapeCrawl(root_path) + crawl.crawl() + + assert "but has already been found" in caplog.text + assert "https://example.org/an-example-post/" in crawl.found_pages.values() + + +def test_crawl_non_dir(shared_datadir: Path): + with pytest.raises(ValueError, match="is not a directory"): + ScrapeCrawl(shared_datadir / "notreal") diff --git a/tests/scrape/test_processor.py b/tests/scrape/test_processor.py new file mode 100644 index 0000000..1733f0e --- /dev/null +++ b/tests/scrape/test_processor.py @@ -0,0 +1,65 @@ +import pytest +from bs4 import BeautifulSoup +from wpextract.scrape.processor import ( + _is_url_valid, + extract_self_url, + get_link_canonical, + get_og_url, +) + + +@pytest.mark.parametrize( + ("url", "expected"), + [ + ("https://example.org/", True), + ("http://example.org/", True), + ("/", False), + ("notaurl", False), + ], +) +def test_url_valid(url, expected): + assert _is_url_valid(url) == expected + + +@pytest.mark.parametrize( + ("file", "exp_out"), + [ + ("link_canonical.html", "https://example.org/page_canon/"), + ("link_canonical_no_href.html", None), + ("link_canonical_empty_href.html", None), + ("no_head.html", None), + ("og_url.html", None), + ], +) +def test_get_link_canonical(datadir, file, exp_out): + soup = BeautifulSoup((datadir / file).read_text(), "lxml") + assert get_link_canonical(soup) == exp_out + + +@pytest.mark.parametrize( 
+ ("file", "exp_out"), + [ + ("og_url.html", "https://example.org/page_og/"), + ("og_url_no_content.html", None), + ("og_url_empty_content.html", None), + ("no_head.html", None), + ("link_canonical.html", None), + ], +) +def test_get_og_url(datadir, file, exp_out): + soup = BeautifulSoup((datadir / file).read_text(), "lxml") + assert get_og_url(soup) == exp_out + + +@pytest.mark.parametrize( + ("file", "exp_out"), + [ + ("link_canonical.html", "https://example.org/page_canon/"), + ("og_url.html", "https://example.org/page_og/"), + ("self_url_both.html", "https://example.org/page_canon/"), + ("no_self_url.html", None), + ], +) +def test_extract_self_url(datadir, file, exp_out): + soup = BeautifulSoup((datadir / file).read_text(), "lxml") + assert extract_self_url(soup) == exp_out diff --git a/tests/scrape/test_processor/link_canonical.html b/tests/scrape/test_processor/link_canonical.html new file mode 100644 index 0000000..5408e7f --- /dev/null +++ b/tests/scrape/test_processor/link_canonical.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/link_canonical_empty_href.html b/tests/scrape/test_processor/link_canonical_empty_href.html new file mode 100644 index 0000000..37f9b6b --- /dev/null +++ b/tests/scrape/test_processor/link_canonical_empty_href.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/link_canonical_no_href.html b/tests/scrape/test_processor/link_canonical_no_href.html new file mode 100644 index 0000000..ce095fd --- /dev/null +++ b/tests/scrape/test_processor/link_canonical_no_href.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/no_head.html b/tests/scrape/test_processor/no_head.html new file mode 100644 index 0000000..a7af195 --- /dev/null +++ b/tests/scrape/test_processor/no_head.html @@ -0,0 +1,6 @@ + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/no_self_url.html b/tests/scrape/test_processor/no_self_url.html new file mode 100644 index 0000000..152ee7f --- /dev/null +++ b/tests/scrape/test_processor/no_self_url.html @@ -0,0 +1,9 @@ + + + + Test Document + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/og_url.html b/tests/scrape/test_processor/og_url.html new file mode 100644 index 0000000..b6f0f60 --- /dev/null +++ b/tests/scrape/test_processor/og_url.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/og_url_empty_content.html b/tests/scrape/test_processor/og_url_empty_content.html new file mode 100644 index 0000000..27068b4 --- /dev/null +++ b/tests/scrape/test_processor/og_url_empty_content.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/og_url_no_content.html b/tests/scrape/test_processor/og_url_no_content.html new file mode 100644 index 0000000..a5ae81c --- /dev/null +++ b/tests/scrape/test_processor/og_url_no_content.html @@ -0,0 +1,10 @@ + + + + Test Document + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_processor/self_url_both.html b/tests/scrape/test_processor/self_url_both.html new file mode 100644 index 0000000..0404d7a --- /dev/null +++ b/tests/scrape/test_processor/self_url_both.html @@ -0,0 +1,11 @@ + + + + Test Document + + + + +

test document

+ + \ No newline at end of file diff --git a/tests/scrape/test_scrape.py b/tests/scrape/test_scrape.py new file mode 100644 index 0000000..b80e083 --- /dev/null +++ b/tests/scrape/test_scrape.py @@ -0,0 +1,16 @@ +from wpextract.scrape.scrape import load_scrape + + +def test_load_scrape(shared_datadir): + scrape_urls_files = { + "https://example.org/an-example-post/": shared_datadir + / "scrape" + / "an-example-post" + / "index.html", + } + + soup = load_scrape(scrape_urls_files, "https://example.org/an-example-post/") + assert soup.title.string == "An Example Post - example.org" + + not_found = load_scrape(scrape_urls_files, "https://example.org/not-found/") + assert not_found is None
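
---

Note for reviewers: below is a minimal sketch of the crawl and self-URL APIs these tests exercise, assuming a hypothetical `my-site-scrape` directory laid out like the fixtures added under `tests/scrape/data/scrape/` (the path and layout are illustrative, not part of this patch):

```python
from pathlib import Path

from bs4 import BeautifulSoup
from wpextract.scrape.crawler import ScrapeCrawl
from wpextract.scrape.processor import extract_self_url

# Hypothetical directory holding a scraped copy of a site.
scrape_root = Path("my-site-scrape")

crawl = ScrapeCrawl(scrape_root)
crawl.crawl()  # writes url_cache.json; a rerun with the same SCRAPE_CRAWL_VERSION reuses it

print(crawl.found_pages)          # relative file path -> self URL
print(crawl.failed_docs)          # relative paths of pages with no usable self URL
print(crawl.get_link_abs_path())  # self URL -> absolute file path

# The self-URL extraction can also be called directly on a parsed page;
# a canonical <link> takes precedence over og:url, and None is returned
# if neither yields a valid absolute URL.
doc = BeautifulSoup((scrape_root / "an-example-post/index.html").read_text(), "lxml")
print(extract_self_url(doc))
```

Since `found_pages` maps relative paths to URLs, the duplicate check now compares against `.values()`: if two files resolve to the same URL, only the first is kept and the second is logged, which is what `test_duplicate_url` asserts.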