Skip to content

Commit

Permalink
Allow empty scrape properly
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyheppell committed Jul 3, 2024
1 parent de859bd commit 3d74989
Show file tree
Hide file tree
Showing 13 changed files with 1,265 additions and 15 deletions.
9 changes: 8 additions & 1 deletion src/extractor/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,15 @@ def main() -> None:
)

parser.add_argument("json_root", help="JSON dump of the site", type=directory)
parser.add_argument("scrape_root", help="HTML scrape of the site", type=directory)
parser.add_argument("out_dir", help="Output directory", type=empty_directory)
parser.add_argument(
"--scrape-root",
'-S',
help="Root directory of an HTML scrape",
type=directory,
required=False,
default=None
)
parser.add_argument(
"--json-prefix",
"-P",
Expand Down
18 changes: 14 additions & 4 deletions src/extractor/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@
class WPExtractor:
"""Main class to extract data."""

json_root: Path
scrape_root: Optional[Path]
json_prefix: Optional[str]
link_registry: LinkRegistry

posts: Optional[DataFrame]
media: Optional[DataFrame]
tags: Optional[DataFrame]
Expand All @@ -38,7 +42,7 @@ class WPExtractor:
def __init__(
self,
json_root: Path,
scrape_root: Path,
scrape_root: Optional[Path] = None,
json_prefix: Optional[str] = None,
translation_pickers: Optional[PickerListType] = None,
):
Expand Down Expand Up @@ -81,6 +85,11 @@ def _prefix_filename(self, file_name):
return prefix_filename(file_name, self.json_prefix)

def _crawl_scrape(self):
if self.scrape_root is None:
logging.info("No scrape root specified, skipping")
self.scrape_url_mapping = {}
return

crawl = ScrapeCrawl(self.scrape_root)
crawl.crawl()
self.scrape_url_mapping = crawl.get_link_abs_path()
Expand Down Expand Up @@ -116,9 +125,10 @@ def _extract_pages(self):

def _resolve_post_links(self):
    """Resolve internal post links, then (when available) translations.

    The scrape residue duplicated the pre- and post-commit bodies; this is
    the guarded final version: translation columns only exist when a scrape
    was crawled, so a JSON-only run must skip translation resolution rather
    than fail on the missing "translations" column.
    """
    self.posts = resolve_post_links(self.link_registry, self.posts)
    if "translations" in self.posts.columns:
        self.posts = resolve_post_translations(self.link_registry, self.posts)
        # Make A->B imply B->A before re-resolving, so both directions
        # of every translation pair end up registered.
        self.posts = ensure_translations_undirected(self.posts)
        self.posts = resolve_post_translations(self.link_registry, self.posts)

def export(self, out_dir: Path) -> None:
"""Save scrape results to ``out_dir``."""
Expand Down
23 changes: 13 additions & 10 deletions src/extractor/extractors/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def load_posts(
path: Path,
link_registry: LinkRegistry,
scrape_urls_files: Dict[str, Path],
translation_pickers: Optional[PickerListType],
translation_pickers: Optional[PickerListType] = None,
) -> Optional[pd.DataFrame]:
"""Load the posts from a JSON file.
Expand Down Expand Up @@ -102,20 +102,23 @@ def load_posts(
tqdm.pandas(desc="Parsing Content")
posts_df["content.bs"] = posts_df["content.rendered"].progress_apply(parse_html)

tqdm.pandas(desc="Parsing Scrape")
posts_df["scrape_bs"] = posts_df["link"].progress_apply(
lambda link: load_scrape(scrape_urls_files, link)
)
posts_df[["language", "translations"]] = posts_df.apply(
lambda r: extract_translations(r["scrape_bs"], r["link"], translation_pickers),
axis=1,
)
if scrape_urls_files != {}:
tqdm.pandas(desc="Parsing Scrape")
posts_df["scrape_bs"] = posts_df["link"].progress_apply(
lambda link: load_scrape(scrape_urls_files, link)
)
posts_df[["language", "translations"]] = posts_df.apply(
lambda r: extract_translations(r["scrape_bs"], r["link"], translation_pickers),
axis=1,
)
else:
logging.info("Skipping translation extraction")

link_registry.add_linkables(
"post", posts_df["link"].to_list(), posts_df.index.to_list()
)

tqdm.pandas(desc="Extracting scrape")
tqdm.pandas(desc="Extracting from text")
posts_df[
["content.text", "links.internal", "links.external", "embeds", "images"]
] = posts_df.progress_apply(
Expand Down
54 changes: 54 additions & 0 deletions tests/extract/test_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from pathlib import Path

import pandas as pd

from extractor.extract import WPExtractor


def _assert_extractor_valid(extractor: WPExtractor):
    """Assert every extracted table on ``extractor`` is a populated DataFrame attribute."""
    for table in ("posts", "media", "tags", "categories", "users", "pages"):
        assert isinstance(getattr(extractor, table), pd.DataFrame)


def _assert_output_valid(out_dir: Path):
assert (out_dir / "posts.json").is_file()
assert (out_dir / "media.json").is_file()
assert (out_dir / "tags.json").is_file()
assert (out_dir / "categories.json").is_file()
assert (out_dir / "users.json").is_file()
assert (out_dir / "pages.json").is_file()


def test_extract_no_scrape(datadir):
    """A JSON-only run (no scrape_root) must still extract and export fully."""
    extractor = WPExtractor(json_root=datadir / "json")

    extractor.extract()
    _assert_extractor_valid(extractor)

    export_dir = datadir / "out_json"
    export_dir.mkdir()
    extractor.export(export_dir)
    _assert_output_valid(export_dir)


def test_extract_scrape(datadir):
    """A run with a scrape root must additionally yield translation data."""
    extractor = WPExtractor(
        json_root=datadir / "json",
        scrape_root=datadir / "scrape",
    )

    extractor.extract()
    _assert_extractor_valid(extractor)
    assert "translations" in extractor.posts.columns

    export_dir = datadir / "out_json"
    export_dir.mkdir()
    extractor.export(export_dir)
    _assert_output_valid(export_dir)
179 changes: 179 additions & 0 deletions tests/extract/test_extract/json/categories.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
[
{
"id": 1,
"count": 3,
"description": "About Test Category One",
"link": "https://example.org/test-category-one/",
"name": "Test Category One",
"slug": "test-category-one",
"taxonomy": "category",
"parent": 0,
"meta": [],
"yoast_head": "<!-- whole load of stuff -->",
"yoast_head_json": {
"title": "Test Category One - example.org",
"robots": {
"index": "index",
"follow": "follow",
"max-snippet": "max-snippet:-1",
"max-image-preview": "max-image-preview:large",
"max-video-preview": "max-video-preview:-1"
},
"canonical": "https://example.org/test-category-one/",
"og_locale": "en_US",
"og_type": "article",
"og_title": "Test Category One - example.org",
"og_url": "https://example.org/test-category-one/",
"og_site_name": "example.org",
"twitter_card": "summary_large_image",
"schema": "..."
},
"_links": {
"self": [
{
"href": "https://example.org/wp-json/wp/v2/categories/1"
}
],
"collection": [
{
"href": "https://example.org/wp-json/wp/v2/categories"
}
],
"about": [
{
"href": "https://example.org/wp-json/wp/v2/taxonomies/category"
}
],
"wp:post_type": [
{
"href": "https://example.org/wp-json/wp/v2/posts?categories=1"
}
],
"curies": [
{
"name": "wp",
"href": "https://api.w.org/{rel}",
"templated": true
}
]
}
},
{
"id": 2,
"count": 3,
"description": "About Test Category Two",
"link": "https://example.org/fr/test-category-two/",
"name": "Test Category Two",
"slug": "test-category-two",
"taxonomy": "category",
"parent": 0,
"meta": [],
"yoast_head": "<!-- whole load of stuff -->",
"yoast_head_json": {
"title": "Test Category Two - example.org",
"robots": {
"index": "index",
"follow": "follow",
"max-snippet": "max-snippet:-1",
"max-image-preview": "max-image-preview:large",
"max-video-preview": "max-video-preview:-1"
},
"canonical": "https://example.org/fr/test-category-two/",
"og_locale": "fr_FR",
"og_type": "article",
"og_title": "Test Category Two - example.org",
"og_url": "https://example.org/fr/test-category-two/",
"og_site_name": "example.org",
"twitter_card": "summary_large_image",
"schema": "..."
},
"_links": {
"self": [
{
"href": "https://example.org/wp-json/wp/v2/categories/2"
}
],
"collection": [
{
"href": "https://example.org/wp-json/wp/v2/categories"
}
],
"about": [
{
"href": "https://example.org/wp-json/wp/v2/taxonomies/category"
}
],
"wp:post_type": [
{
"href": "https://example.org/wp-json/wp/v2/posts?categories=2"
}
],
"curies": [
{
"name": "wp",
"href": "https://api.w.org/{rel}",
"templated": true
}
]
}
},
{
"id": 3,
"count": 3,
"description": "About Test Category Three",
"link": "https://example.org/fr/test-category-three/",
"name": "Test Category Three",
"slug": "test-category-three",
"taxonomy": "category",
"parent": 2,
"meta": [],
"yoast_head": "<!-- whole load of stuff -->",
"yoast_head_json": {
"title": "Test Category Three - example.org",
"robots": {
"index": "index",
"follow": "follow",
"max-snippet": "max-snippet:-1",
"max-image-preview": "max-image-preview:large",
"max-video-preview": "max-video-preview:-1"
},
"canonical": "https://example.org/fr/test-category-three/",
"og_locale": "en_US",
"og_type": "article",
"og_title": "Test Category Three - example.org",
"og_url": "https://example.org/fr/test-category-three/",
"og_site_name": "example.org",
"twitter_card": "summary_large_image",
"schema": "..."
},
"_links": {
"self": [
{
"href": "https://example.org/wp-json/wp/v2/categories/3"
}
],
"collection": [
{
"href": "https://example.org/wp-json/wp/v2/categories"
}
],
"about": [
{
"href": "https://example.org/wp-json/wp/v2/taxonomies/category"
}
],
"wp:post_type": [
{
"href": "https://example.org/wp-json/wp/v2/posts?categories=3"
}
],
"curies": [
{
"name": "wp",
"href": "https://api.w.org/{rel}",
"templated": true
}
]
}
}
]
Loading

0 comments on commit 3d74989

Please sign in to comment.