From ad1b8e4ea4c190c8bbfc4717cc9145c6746a1615 Mon Sep 17 00:00:00 2001
From: Jack Cushman
Date: Wed, 4 Dec 2024 14:23:54 -0500
Subject: [PATCH] Refactor to support collection backends, allowing for
 changing output path

---
 README.md                                     | 28 ++++++-
 src/nabit/bin/cli.py                          | 47 ++++++++++--
 src/nabit/lib/archive.py                      | 46 +++---------
 src/nabit/lib/backends/__init__.py            |  0
 src/nabit/lib/backends/base.py                | 20 +++++
 src/nabit/lib/backends/path.py                | 45 +++++++++++
 src/nabit/lib/{capture.py => backends/url.py} | 75 ++++++++++++-------
 tests/backends/__init__.py                    |  0
 .../test_path_backend.py}                     |  9 +--
 tests/backends/test_url_backend.py            | 21 ++++++
 tests/conftest.py                             | 17 ++++-
 tests/test_archive.py                         | 21 +-----
 tests/test_cli.py                             | 62 +++++++++++++++
 13 files changed, 291 insertions(+), 100 deletions(-)
 create mode 100644 src/nabit/lib/backends/__init__.py
 create mode 100644 src/nabit/lib/backends/base.py
 create mode 100644 src/nabit/lib/backends/path.py
 rename src/nabit/lib/{capture.py => backends/url.py} (73%)
 create mode 100644 tests/backends/__init__.py
 rename tests/{test_capture.py => backends/test_path_backend.py} (90%)
 create mode 100644 tests/backends/test_url_backend.py

diff --git a/README.md b/README.md
index dda0cfe..2ca0188 100644
--- a/README.md
+++ b/README.md
@@ -132,9 +132,12 @@ Usage: [OPTIONS] BAG_PATH
 Options:
   -a, --amend             Update an existing archive. May add OR OVERWRITE
                           existing data.
-  -u, --url TEXT          URL to archive (can be repeated)
+  -u, --url TEXT          URL to archive (can be repeated). May be a
+                          bare url or a JSON dict with a "url" key and
+                          an optional "output" key
   -p, --path PATH         File or directory to archive (can be repeated)
+  -c, --collect TEXT      Collection tasks in JSON format
   --hard-link             Use hard links when copying files (when
                           possible)
   -i, --info TEXT         bag-info.txt metadata in key:value format
@@ -206,6 +209,29 @@ In many situations it may make sense to create and sign bags on different machin
 ```
 * The signed bag is then published to the archive, perhaps simply by copying the bag directory to a public file server.
 
+Collection backends
+-------------------
+
+`bag-nabit` is not primarily a web archiving tool, but it supports collection backends that can gather both web content and file content. Collection tasks can be provided as a JSON list passed to the `--collect` flag of `nabit archive`:
+
+```
+nabit archive example_bag --collect '[
+  {"backend": "url", "url": "https://example.com/", "output": "example_com.html"},
+  {"backend": "path", "path": "/path/to/local/file"}
+]'
+```
+
+Currently supported collection backends are:
+
+* `url`: fetch URLs with Python `requests`, following redirects. Request and response metadata is written to `data/headers.warc`. Equivalent to the `-u` flag of `nabit archive`. Keys:
+  * `url`: the URL to fetch
+  * `output` (optional): the path to save the fetched content to in the bag, relative to `data/files/`. If not provided, the content is saved to `data/files/<filename>`, where `<filename>` is the last path component of the URL.
+* `path`: copy local files or directories to the bag. Equivalent to the `-p` flag of `nabit archive`. Keys:
+  * `path`: the path to the local file or directory to copy
+  * `output` (optional): the path to save the copied content to in the bag.
+
+Future backends could include FTP, web crawlers, etc.
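For a single task, the same keys can also be passed directly to `-u` as a JSON dict instead of a bare URL. A hypothetical invocation, mirroring the `-u` option help above and the `test_url_with_custom_output_path` test added below:

```
nabit archive example_bag -u '{"url": "https://example.com/", "output": "example_com.html"}'
```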
+ File format ----------- diff --git a/src/nabit/bin/cli.py b/src/nabit/bin/cli.py index d90ad22..aafdc52 100644 --- a/src/nabit/bin/cli.py +++ b/src/nabit/bin/cli.py @@ -4,8 +4,10 @@ from pathlib import Path from .utils import assert_file_exists, assert_url, cli_validate, CaptureCommand -from ..lib.archive import package, validate_package +from ..lib.archive import package from ..lib.sign import KNOWN_TSAS +from ..lib.backends.base import CollectionTask +from ..lib.backends.path import PathCollectionTask @click.group() def main(): @@ -16,8 +18,9 @@ def main(): @main.command(cls=CaptureCommand) @click.argument('bag_path', type=click.Path(path_type=Path)) @click.option('--amend', '-a', is_flag=True, help='Update an existing archive. May add OR OVERWRITE existing data.') -@click.option('--url', '-u', 'urls', multiple=True, help='URL to archive (can be repeated)') +@click.option('--url', '-u', 'urls', multiple=True, help='URL to archive (can be repeated). May be a bare url or a JSON dict with a "url" key and an optional "output" key') @click.option('--path', '-p', 'paths', multiple=True, type=click.Path(exists=True, path_type=Path), help='File or directory to archive (can be repeated)') +@click.option('--collect', '-c', 'collect', help='Collection tasks in JSON format') @click.option('--hard-link', is_flag=True, help='Use hard links when copying files (when possible)') @click.option('--info', '-i', multiple=True, help='bag-info.txt metadata in key:value format (can be repeated)') @click.option('--signed-metadata', 'signed_metadata_path', type=click.Path(exists=True, path_type=Path, dir_okay=False), @@ -43,6 +46,7 @@ def archive( amend, urls, paths, + collect, hard_link, info, signed_metadata_path, @@ -98,9 +102,41 @@ def archive( raise click.BadParameter(f'Metadata must be in "key:value" format, got "{item}"') bag_info[key.strip()].append(value.strip()) - # validate URLs + # Convert collect to list if it's a tuple + if collect: + try: + collect = json.loads(collect) + except json.JSONDecodeError: + raise click.BadParameter(f'Invalid JSON string for --collect: {collect}') + if not isinstance(collect, list): + raise click.BadParameter(f'--collect must be a list of JSON objects, got {collect}') + else: + collect = [] + + # Append --url and --path to --collect for url in urls: - assert_url(url) + try: + url_dict = json.loads(url) + url_dict['backend'] = 'url' + except json.JSONDecodeError: + url_dict = {'backend': 'url', 'url': url} + collect.append(url_dict) + for path in paths: + collect.append({'backend': 'path', 'path': str(path)}) + + # Process and validate collect + processed_collect = [] + for task in collect: + try: + processed_collect.append(CollectionTask.from_dict(task)) + except Exception as e: + raise click.BadParameter(f'Invalid task definition for --collect: {task} resulted in {e}') + + # handle --hard-link option + if hard_link: + for task in processed_collect: + if isinstance(task, PathCollectionTask): + task.hard_links = True ## handle --sign and --timestamp options # order matters, so get ordered list of signature flags from sys.argv @@ -139,8 +175,7 @@ def archive( package( output_path=bag_path, - paths=paths, - urls=urls, + collect=processed_collect, bag_info=bag_info, signatures=signatures, signed_metadata=metadata['signed'], diff --git a/src/nabit/lib/archive.py b/src/nabit/lib/archive.py index f5214eb..243cb7f 100644 --- a/src/nabit/lib/archive.py +++ b/src/nabit/lib/archive.py @@ -1,17 +1,16 @@ from pathlib import Path -import shutil from datetime import date import bagit 
import os -from .utils import get_unique_path, noop -from .capture import validate_warc_headers, capture -from .sign import validate_signatures, KNOWN_TSAS, add_signatures -from .. import __version__ import hashlib import json -# files to ignore when copying directories -IGNORE_PATTERNS = ['.DS_Store'] +from .utils import noop +from .backends.url import validate_warc_headers +from .sign import validate_signatures, KNOWN_TSAS, add_signatures +from .. import __version__ +from .backends.base import CollectionTask + def validate_bag_format(bag_path: Path, error, warn, success) -> None: """Verify bag format.""" @@ -54,34 +53,10 @@ def error(message: str, metadata: dict | None = None) -> None: validate_bag_format(bag_path, error, warn, success) validate_signatures(tagmanifest_path, error, warn, success) -def copy_paths(source_paths: list[Path | str], dest_dir: Path, use_hard_links: bool = False) -> None: - """Copy paths to a destination directory, optionally using hard links.""" - for path in source_paths: - path = Path(path) - dest_path = get_unique_path(dest_dir / path.name) - # can only use hard links if source and destination are on the same device - use_hard_links = use_hard_links and os.stat(path).st_dev == os.stat(dest_dir).st_dev - if path.is_file(): - if use_hard_links: - os.link(path, dest_path) - else: - shutil.copy2(path, dest_path) - else: - copy_function = os.link if use_hard_links else shutil.copy2 - # link directory contents recursively - shutil.copytree( - path, - dest_path, - dirs_exist_ok=True, - copy_function=copy_function, - ignore=shutil.ignore_patterns(*IGNORE_PATTERNS) - ) - def package( output_path: Path | str, amend: bool = False, - urls: list[str] | None = None, - paths: list[Path | str] | None = None, + collect: list[CollectionTask] | None = None, bag_info: dict | None = None, signatures: list[dict] | None = None, signed_metadata: dict | None = None, @@ -105,10 +80,9 @@ def package( files_path = data_path / 'files' files_path.mkdir(exist_ok=True, parents=True) - if urls: - capture(urls, data_path / 'headers.warc') - if paths: - copy_paths(paths, files_path, use_hard_links) + if collect: + for task in collect: + task.collect(files_path) # Add metadata files if signed_metadata is not None: diff --git a/src/nabit/lib/backends/__init__.py b/src/nabit/lib/backends/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nabit/lib/backends/base.py b/src/nabit/lib/backends/base.py new file mode 100644 index 0000000..2b8e47e --- /dev/null +++ b/src/nabit/lib/backends/base.py @@ -0,0 +1,20 @@ +from dataclasses import dataclass +from functools import lru_cache + +@lru_cache +def get_backends() -> dict[str, type['CollectionTask']]: + # do this in a cached function to avoid circular import + from .url import UrlCollectionTask + from .path import PathCollectionTask + + return { + 'url': UrlCollectionTask, + 'path': PathCollectionTask, + } + +@dataclass +class CollectionTask: + @classmethod + def from_dict(cls, data: dict) -> 'CollectionTask': + backend = data.pop('backend') + return get_backends()[backend](**data) diff --git a/src/nabit/lib/backends/path.py b/src/nabit/lib/backends/path.py new file mode 100644 index 0000000..5170a0b --- /dev/null +++ b/src/nabit/lib/backends/path.py @@ -0,0 +1,45 @@ +import os +import shutil +from pathlib import Path +from dataclasses import dataclass +from ..utils import get_unique_path +from .base import CollectionTask + +@dataclass +class PathCollectionTask(CollectionTask): + """Collect files or directories from the local 
filesystem.""" + path: Path + output: Path | None = None + hard_links: bool = False + + ignore_patterns = ['.DS_Store'] + + def __post_init__(self): + """Validate the path and ensure it's a Path object.""" + self.path = Path(self.path) # Coerce to Path if it's a string + if not self.path.exists(): + raise ValueError(f'Path "{self.path}" does not exist') + if self.output is not None: + self.output = Path(self.output) # Also coerce output if provided + + def collect(self, files_dir: Path) -> None: + """Copy paths to a destination directory, optionally using hard links.""" + path = self.path + dest_path = get_unique_path(files_dir / path.name) + # can only use hard links if source and destination are on the same device + use_hard_links = self.hard_links and os.stat(path).st_dev == os.stat(files_dir).st_dev + if path.is_file(): + if use_hard_links: + os.link(path, dest_path) + else: + shutil.copy2(path, dest_path) + else: + copy_function = os.link if use_hard_links else shutil.copy2 + # link directory contents recursively + shutil.copytree( + path, + dest_path, + dirs_exist_ok=True, + copy_function=copy_function, + ignore=shutil.ignore_patterns(*self.ignore_patterns) + ) diff --git a/src/nabit/lib/capture.py b/src/nabit/lib/backends/url.py similarity index 73% rename from src/nabit/lib/capture.py rename to src/nabit/lib/backends/url.py index 542d0da..b07ea5d 100644 --- a/src/nabit/lib/capture.py +++ b/src/nabit/lib/backends/url.py @@ -4,9 +4,12 @@ from urllib.parse import urlparse import mimetypes from pathlib import Path -import requests # requests must be imported after capture_http +import requests import os -from nabit.lib.utils import get_unique_path +from dataclasses import dataclass +from ..utils import get_unique_path +from .base import CollectionTask + """ This file handles capturing of URLs and request/response metadata. We use an unpacked WARC format to make it easier to access underlying data files. @@ -29,11 +32,36 @@ even if the original response was gzip encoded in transit. """ +@dataclass +class UrlCollectionTask(CollectionTask): + """Collect URLs and request/response metadata.""" + url: str + output: Path | None = None + + def __post_init__(self): + """Validate the URL by attempting to prepare a request.""" + requests.Request('GET', self.url).prepare() + + def collect(self, files_dir: Path) -> None: + """ + Capture URL to a WARC file using our custom FileWriter. + Appends to the WARC file if it already exists. + """ + warc_path = files_dir.parent / 'headers.warc' + with open(warc_path, 'ab') as fh: + warc_writer = FileWriter(fh, warc_path, gzip=False) + with capture_http(warc_writer): + warc_writer.custom_out_path = self.output + requests.get(self.url) + + class FileWriter(WARCWriter): """ A WARC writer that stores response bodies uncompressed in the files/ directory. 
""" revisit_status_codes = set(['200', '203']) + custom_out_path = None # override output path + def __init__(self, filebuf, warc_path: Path, *args, **kwargs): super(WARCWriter, self).__init__(*args, **kwargs) self.out = filebuf @@ -49,19 +77,23 @@ def _write_warc_record(self, out, record): headers.replace_header('WARC-Type', 'revisit') ## get a filename for the response body - uri = headers.get_header('WARC-Target-URI') - parsed_url = urlparse(uri) - filename = Path(parsed_url.path.split('/')[-1]) - # set stem - stem = filename.stem.lstrip('.') or 'data' - # set extension - extension = filename.suffix - if not extension: - if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch - extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False) - if not extension: - extension = '.unknown' # pragma: no cover - out_path = get_unique_path(self.files_path / f'{stem}{extension}') + if self.custom_out_path is not None: + out_path = self.custom_out_path + else: + uri = headers.get_header('WARC-Target-URI') + parsed_url = urlparse(uri) + filename = Path(parsed_url.path.split('/')[-1]) + # set stem + stem = filename.stem.lstrip('.') or 'data' + # set extension + extension = filename.suffix + if not extension: + if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch + extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False) + if not extension: + extension = '.unknown' # pragma: no cover + out_path = f'{stem}{extension}' + out_path = get_unique_path(self.files_path / out_path) relative_path = out_path.relative_to(self.warc_path.parent) # add our custom WARC-Profile header @@ -90,19 +122,6 @@ def _write_warc_record(self, out, record): return super()._write_warc_record(out, record) - -def capture(urls: list[str], warc_path: Path, request_kwargs: dict = {}) -> None: - """ - Capture a list of URLs to a WARC file using our custom FileWriter. - Appends to the WARC file if it already exists. - """ - use_gzip = str(warc_path).endswith('.gz') - with open(warc_path, 'ab') as fh: - warc_writer = FileWriter(fh, warc_path, gzip=use_gzip) - with capture_http(warc_writer): - for url in urls: - requests.get(url, **request_kwargs) - def validate_warc_headers(headers_path: Path, error, warn, success) -> None: """ Validate a headers.warc file created by capture(). 
diff --git a/tests/backends/__init__.py b/tests/backends/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_capture.py b/tests/backends/test_path_backend.py similarity index 90% rename from tests/test_capture.py rename to tests/backends/test_path_backend.py index ba28734..2ecd01d 100644 --- a/tests/test_capture.py +++ b/tests/backends/test_path_backend.py @@ -1,7 +1,6 @@ import pytest -from pathlib import Path from warcio.archiveiterator import ArchiveIterator -from nabit.lib.capture import capture +from nabit.lib.backends.url import UrlCollectionTask @pytest.fixture @@ -19,7 +18,7 @@ def capture_dir(tmp_path): def test_capture_with_content(capture_dir, server): """Test capturing a 200 response with body content""" - capture([server.url_for("/test.txt")], capture_dir["headers_path"]) + UrlCollectionTask(url=server.url_for("/test.txt")).collect(capture_dir["files_dir"]) # Check headers.warc with open(capture_dir["headers_path"], 'rb') as fh: @@ -41,7 +40,7 @@ def test_capture_empty_response(capture_dir, server): # Add empty response to server server.expect_request("/empty").respond_with_data("") - capture([server.url_for("/empty")], capture_dir["headers_path"]) + UrlCollectionTask(url=server.url_for("/empty")).collect(capture_dir["headers_path"]) # Check headers.warc - should be a response record, not revisit with open(capture_dir["headers_path"], 'rb') as fh: @@ -65,7 +64,7 @@ def test_capture_redirect(capture_dir, server): headers={"Location": target_url} ) - capture([redirect_url], capture_dir["headers_path"]) + UrlCollectionTask(url=redirect_url).collect(capture_dir["headers_path"]) # Check headers.warc with open(capture_dir["headers_path"], 'rb') as fh: diff --git a/tests/backends/test_url_backend.py b/tests/backends/test_url_backend.py new file mode 100644 index 0000000..c1e0cc8 --- /dev/null +++ b/tests/backends/test_url_backend.py @@ -0,0 +1,21 @@ +from nabit.lib.backends.path import PathCollectionTask + + +def test_ds_store_ignored(tmp_path): + """Test that files in ignore_patterns are ignored when copying directories""" + # Setup source directory + source_dir = tmp_path / "test_dir" + source_dir.mkdir() + (source_dir / ".DS_Store").write_text("ignored") + (source_dir / "test.txt").write_text("included") + + # Setup destination directory + dest_dir = tmp_path / "output" + dest_dir.mkdir() + + # Test copying + PathCollectionTask(path=str(source_dir)).collect(dest_dir) + + # Verify results + assert not (dest_dir / "test_dir/.DS_Store").exists() + assert (dest_dir / "test_dir/test.txt").read_text() == "included" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 9e173ae..500c5a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,8 @@ from nabit.lib.archive import package from nabit.lib.sign import KNOWN_TSAS - +from nabit.lib.backends.path import PathCollectionTask +from nabit.lib.backends.url import UrlCollectionTask @pytest.fixture def test_files(tmp_path): @@ -26,7 +27,10 @@ def test_bag(tmp_path, test_files): bag_path = tmp_path / "test_bag" package( output_path=bag_path, - paths=test_files["payload"], + collect=[ + PathCollectionTask(path=str(test_files["payload"][0])), + PathCollectionTask(path=str(test_files["payload"][1])) + ], signed_metadata=test_files["signed_metadata"].read_text(), unsigned_metadata=test_files["unsigned_metadata"].read_text(), bag_info={"Source-Organization": "Test Org"} @@ -40,7 +44,9 @@ def warc_bag(tmp_path, server): bag_path = tmp_path / "warc_bag" package( 
output_path=bag_path, - urls=[server.url_for("/")], + collect=[ + UrlCollectionTask(url=server.url_for("/")) + ], bag_info={"Source-Organization": "Test Org"} ) return bag_path @@ -59,7 +65,10 @@ def signed_bag(tmp_path, test_files, root_ca): # TODO: don't call out to live TSA server package( output_path=bag_path, - paths=test_files["payload"], + collect=[ + PathCollectionTask(path=str(test_files["payload"][0])), + PathCollectionTask(path=str(test_files["payload"][1])) + ], signed_metadata=test_files["signed_metadata"].read_text(), unsigned_metadata=test_files["unsigned_metadata"].read_text(), bag_info={"Source-Organization": "Test Org"}, diff --git a/tests/test_archive.py b/tests/test_archive.py index baaaf26..f74e158 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -1,26 +1,7 @@ import pytest -from nabit.lib.archive import copy_paths, validate_package +from nabit.lib.archive import validate_package -def test_ds_store_ignored(tmp_path): - """Test that files in IGNORE_PATTERNS are ignored when copying directories""" - # Setup source directory - source_dir = tmp_path / "test_dir" - source_dir.mkdir() - (source_dir / ".DS_Store").write_text("ignored") - (source_dir / "test.txt").write_text("included") - - # Setup destination directory - dest_dir = tmp_path / "output" - dest_dir.mkdir() - - # Test copying - copy_paths([source_dir], dest_dir) - - # Verify results - assert not (dest_dir / "test_dir/.DS_Store").exists() - assert (dest_dir / "test_dir/test.txt").read_text() == "included" - def test_validate_raises(tmp_path): # make sure that vanilla validate_package raises an error # unless there's an error callback that does something else diff --git a/tests/test_cli.py b/tests/test_cli.py index 949ed76..51a6fe6 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -306,6 +306,45 @@ def test_duplicate_file_names(runner, tmp_path, server): files = sorted((p.name for p in (bag_path / "data" / "files").glob("data*.html"))) assert re.match(r"data-[0-9a-zA-Z]{6}\.html;data-[0-9a-zA-Z]{6}\.html;data\.html", ";".join(files)) +def test_url_with_custom_output_path(runner, tmp_path, server): + """Test archiving a URL with a custom output path""" + bag_path = tmp_path / 'bag' + custom_output_path = 'custom_output.html' + + run(runner, [ + 'archive', + str(bag_path), + '-u', f'{{"url": "{server.url_for("/")}", "output": "{custom_output_path}"}}', + '-u', server.url_for("/another.html"), + ]) + + # Verify that the file is saved with the custom output path + assert (bag_path / f'data/files/{custom_output_path}').read_text() == 'root content' + assert (bag_path / f'data/files/another.html').read_text() == 'another content' + assert validate_passing(bag_path) == snapshot("""\ +SUCCESS: headers.warc found +SUCCESS: bag format is valid +WARNING: No signatures found +WARNING: No timestamps found\ +""") + +def test_collect_json(runner, tmp_path, server): + """Test successful parsing of --collect JSON""" + bag_path = tmp_path / 'bag' + collect_tasks = [ + {"backend": "url", "url": server.url_for("/")}, + {"backend": "url", "url": server.url_for("/another.html"), "output": "custom.html"} + ] + + run(runner, [ + 'archive', + str(bag_path), + '--collect', json.dumps(collect_tasks) + ]) + + assert (bag_path / 'data/files/data.html').read_text() == 'root content' + assert (bag_path / 'data/files/custom.html').read_text() == 'another content' + ## validation errors def test_invalid_metadata_file_extension(runner, tmp_path): @@ -398,3 +437,26 @@ def test_empty_package(runner, tmp_path): 'archive', 
str(tmp_path), ], exit_code=1, output='No files in data/files') + +def test_invalid_collect_json(runner, tmp_path): + """Test error handling for invalid --collect JSON""" + # Test invalid JSON syntax + run(runner, [ + 'archive', + str(tmp_path / 'bag'), + '--collect', '{invalid json}' + ], exit_code=2, output='Invalid JSON string for --collect') + + # Test non-list JSON + run(runner, [ + 'archive', + str(tmp_path / 'bag'), + '--collect', '{"not": "a list"}' + ], exit_code=2, output='--collect must be a list of JSON objects') + + # Test invalid task definition + run(runner, [ + 'archive', + str(tmp_path / 'bag'), + '--collect', '[{"backend": "invalid"}]' + ], exit_code=2, output='Invalid task definition for --collect')
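The README's note about future backends (FTP, crawlers, etc.) maps directly onto this structure: a backend is a dataclass with a `collect(files_dir)` method, looked up by name in `get_backends()`. A purely hypothetical sketch of what an `ftp` backend could look like under those assumptions — it would also need an `'ftp'` entry in the dict returned by `nabit.lib.backends.base.get_backends()` before `--collect` could reach it:

```python
from dataclasses import dataclass
from ftplib import FTP
from pathlib import Path
from urllib.parse import urlparse

from nabit.lib.backends.base import CollectionTask
from nabit.lib.utils import get_unique_path


@dataclass
class FtpCollectionTask(CollectionTask):
    """Collect a single file over FTP (illustrative only, not part of the library)."""
    url: str                    # e.g. "ftp://ftp.example.com/pub/file.txt"
    output: Path | None = None  # optional file name under data/files/

    def collect(self, files_dir: Path) -> None:
        parsed = urlparse(self.url)
        # fall back to the last path component, like the url backend does
        name = self.output or Path(parsed.path).name or "data"
        dest_path = get_unique_path(files_dir / name)
        with FTP(parsed.hostname) as ftp:
            ftp.login()  # anonymous login
            with open(dest_path, "wb") as out:
                ftp.retrbinary(f"RETR {parsed.path}", out.write)
```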