Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate scraping #15

Merged
merged 24 commits into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
WordPress Site Extractor
Copyright 2022-2024 The University of Sheffield

Portions of this code are derived from WPJsonScraper, which is available under the MIT license. For details, see src/extractor/dl/LICENSE.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ The extractor can also be used as a library instead of on the command line.

Typically, you would instantiate a new [`WPExtractor`](src/extractor/extract.py) instance and call its `extract` method. The dataframes can then be accessed as class attributes or exported with the `export` method.

An example usage is available in the CLI script ([`extractor.cli`](src/extractor/cli.py)).
An example usage is available in the CLI script ([`extractor.cli`](src/extractor/cli/cli.py)).

When using this approach, it's possible to use customised translation pickers (see the `translation_pickers` argument of `WPExtractor`). These should be child classes of [`extractor.parse.translations.LangPicker`](src/extractor/parse/translations/_pickers.py).

Expand Down Expand Up @@ -207,3 +207,9 @@ For each resolved link, translation, or media, a destination is set containing i
### 5. Export

The columns of each type are subset and exported as a JSON file each.

## Acknowledgements and License

This software is made available under the terms of the [Apache License version 2.0](LICENSE).

Portions of this software are derived from other works, see [the `NOTICE` file](NOTICE) for further information.
209 changes: 208 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ authors=["Freddy Heppell <[email protected]>"]
packages=[
{ include = "extractor", from = "src"}
]
repository="https://github.com/GateNLP/wordpress-site-extractor"
license="Apache-2.0"

[tool.poetry.scripts]
wpextract = "extractor.cli:main"
wpextract = "extractor.cli.cli:main"

# Workaround for https://github.com/python-poetry/poetry/issues/9293
[[tool.poetry.source]]
Expand All @@ -23,6 +25,8 @@ lxml = ">=5.2.2"
numpy = ">=1.26.4"
pandas = ">=2.2.2"
tqdm = ">=4.66.4"
requests = "^2.32.3"
docconvert = "^2.1.0"

[tool.poetry.group.dev.dependencies]
build = "==0.9.*,>=0.9.0"
Expand Down Expand Up @@ -58,6 +62,12 @@ ignore = [
"D103", # Ignore method docstring errors in tests
"PD901", # Allow `df` variable name in tests
]
"src/extractor/dl/*" = [
"D415",
"D103",
"D101",
"D107"
]

[tool.ruff.lint.pydocstyle]
convention = "google"
Expand Down
76 changes: 0 additions & 76 deletions src/extractor/cli.py

This file was deleted.

Empty file added src/extractor/cli/__init__.py
Empty file.
133 changes: 133 additions & 0 deletions src/extractor/cli/_dl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from argparse import Namespace

from extractor.dl.downloader import WPDownloader
from extractor.dl.requestsession import RequestSession

dl_types = ["categories", "media", "pages", "posts", "tags", "users"]


def register_dl_parser(subparsers):
    """Register the `dl` subcommand."""
    dl = subparsers.add_parser(
        "dl", help="Download a site's content using the WordPress REST API."
    )

    # Positional arguments: the site to download and where to write the JSON.
    dl.add_argument(
        "target",
        type=str,
        help="the base path of the WordPress installation to examine",
    )
    dl.add_argument(
        "out_json",
        type=str,
        help="the path of the output JSON file",
    )
    dl.add_argument(
        "--media-dest",
        type=str,
        default=None,
        help="Path to download media files, skipped if not supplied.",
    )

    # One opt-out flag per data type; every type defaults to being downloaded.
    types = dl.add_argument_group("data types")
    for name in dl_types:
        types.add_argument(
            "--no-" + name,
            dest=name,
            action="store_false",
            help="Don't download " + name,
        )
    dl.set_defaults(**dict.fromkeys(dl_types, True))

    auth = dl.add_argument_group("authentication")
    auth.add_argument("--proxy", "-P", help="Define a proxy server to use")
    auth.add_argument(
        "--auth", help="Define HTTP Basic credentials in format username:password"
    )
    auth.add_argument(
        "--cookies",
        help='define cookies to send with request in the format "cookie1=foo; cookie2=bar"',
    )

    req = dl.add_argument_group("request behaviour")
    req.add_argument(
        "--timeout",
        type=int,
        default=30,
        help="Stop waiting for a response after a given number of seconds (default: %(default)s)",
    )
    req.add_argument(
        "--wait",
        "-w",
        type=float,
        help="Wait the specified number of seconds between retrievals",
    )
    req.add_argument(
        "--random-wait",
        action="store_true",
        help="Randomly varies the time between requests to between 0.5 and 1.5 times the number of seconds set by --wait",
    )
    req.set_defaults(random_wait=False)
    req.add_argument(
        "--max-retries",
        type=int,
        default=10,
        help="Maximum number of retries before giving up (default: %(default)s)",
    )
    req.add_argument(
        "--backoff-factor",
        type=float,
        default=0.1,
        help="Factor to apply delaying retries. Default will sleep for 0.0, 0.2, 0.4, 0.8,... (default: %(default)s)",
    )
    req.add_argument(
        "--max-redirects",
        type=int,
        default=20,
        help="Maximum number of redirects before giving up (default: %(default)s)",
    )


def do_dl(parser, args: Namespace):
    """Perform the `dl` subcommand.

    Args:
        parser: The top-level argument parser, used to report argument errors.
        args: The parsed arguments of the `dl` subcommand.
    """
    # Only download the types whose opt-out flag was not given.
    types_to_dl = [dl_type for dl_type in dl_types if vars(args)[dl_type]]

    if args.random_wait and args.wait is None:
        parser.error(
            "argument --random-wait: cannot be used unless --wait/-w is also set"
        )

    # Normalise the target URL: default to http:// when no scheme is given and
    # ensure a trailing slash so API paths can be appended safely.
    target = args.target
    if not (target.startswith("http://") or target.startswith("https://")):
        target = "http://" + target
    if not target.endswith("/"):
        target += "/"

    auth = None
    if args.auth is not None:
        auth_list = args.auth.split(":")
        if len(auth_list) == 1:
            # Username only; use an empty password.
            auth = (auth_list[0], "")
        elif len(auth_list) >= 2:
            # Re-join the remaining segments so passwords may contain ":".
            auth = (auth_list[0], ":".join(auth_list[1:]))

    # NOTE(review): args.max_redirects is parsed by the CLI but never passed to
    # RequestSession — confirm whether it should be forwarded here.
    session = RequestSession(
        proxy=args.proxy,
        cookies=args.cookies,
        authorization=auth,
        timeout=args.timeout,
        wait=args.wait,
        random_wait=args.random_wait,
        max_retries=args.max_retries,
        backoff_factor=args.backoff_factor,
    )

    downloader = WPDownloader(
        # Bug fix: pass the normalised URL rather than the raw args.target, so
        # the scheme/trailing-slash handling above actually takes effect.
        target=target,
        out_path=args.out_json,
        data_types=types_to_dl,
        session=session,
    )

    downloader.download()

    if args.media_dest:
        downloader.download_media_files(session, args.media_dest)
44 changes: 44 additions & 0 deletions src/extractor/cli/_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from extractor.extract import WPExtractor
from extractor.util.args import directory, empty_directory


def register_extract_parser(subparsers):
    """Register the `extract` subcommand."""
    sub = subparsers.add_parser(
        "extract", help="Convert the downloaded data files into a dataset."
    )

    # Positional arguments; both are validated as directories by the type
    # callables from extractor.util.args.
    sub.add_argument("json_root", help="JSON dump of the site", type=directory)
    sub.add_argument("out_dir", help="Output directory", type=empty_directory)

    # Optional inputs.
    sub.add_argument(
        "--scrape-root",
        "-S",
        help="Root directory of an HTML scrape",
        type=directory,
        required=False,
        default=None,
    )
    sub.add_argument(
        "--json-prefix",
        "-P",
        help="Prefix to the JSON files",
        type=str,
        required=False,
        default=None,
    )
    sub.set_defaults(feature=True)


def do_extract(parser, args):
    """Perform the extract command.

    Args:
        parser: Unused; accepted for signature parity with the other
            subcommand handlers.
        args: The parsed arguments of the `extract` subcommand.
    """
    config = {
        "json_root": args.json_root,
        "scrape_root": args.scrape_root,
        "json_prefix": args.json_prefix,
    }
    wp_extractor = WPExtractor(**config)
    wp_extractor.extract()
    wp_extractor.export(args.out_dir)
74 changes: 74 additions & 0 deletions src/extractor/cli/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import argparse
import logging
from importlib.metadata import version

from tqdm.auto import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from extractor.cli._dl import do_dl, register_dl_parser
from extractor.cli._extract import do_extract, register_extract_parser


def _exec_command(parser, args):
if args.command == "parse":
do_extract(parser, args)
elif args.command == "dl":
do_dl(parser, args)
else:
raise ValueError("Unknown command")


def _get_version():
return version("wp-site-extractor")


def main() -> None:
    """Entrypoint for CLI."""
    parser = argparse.ArgumentParser(
        prog="wpextract",
        description="Create datasets from WordPress sites using the REST API",
    )

    parser.add_argument(
        "--version", action="version", version="%(prog)s " + _get_version()
    )
    parser.add_argument(
        "--log",
        "-l",
        help="File to log to. Will suppress stdout.",
        type=str,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--verbose",
        "-v",
        help="Increase log level to include debug logs",
        action="store_true",
    )

    commands = parser.add_subparsers(
        dest="command",
        required=True,
        title="commands",
    )
    register_extract_parser(commands)
    register_dl_parser(commands)

    args = parser.parse_args()

    # Log to the given file when --log is set, otherwise to stdout.
    log_config = {"level": logging.DEBUG if args.verbose else logging.INFO}
    if args.log is not None:
        log_config["filename"] = args.log
    logging.basicConfig(**log_config)

    tqdm.pandas()

    # When logging to stdout, route log lines through tqdm so progress bars
    # are not broken up by interleaved log output.
    if args.log is not None:
        _exec_command(parser, args)
    else:
        with logging_redirect_tqdm():
            _exec_command(parser, args)
Loading