Refactor to support collection backends, allowing for changing output path

jcushman committed Dec 4, 2024
1 parent 1dbaf9e commit ad1b8e4
Showing 13 changed files with 291 additions and 100 deletions.
28 changes: 27 additions & 1 deletion README.md
@@ -132,9 +132,12 @@ Usage: [OPTIONS] BAG_PATH
Options:
-a, --amend Update an existing archive. May add OR
OVERWRITE existing data.
-u, --url TEXT URL to archive (can be repeated)
-u, --url TEXT URL to archive (can be repeated). May be a
bare url or a JSON dict with a "url" key and
an optional "output" key
-p, --path PATH File or directory to archive (can be
repeated)
-c, --collect TEXT Collection tasks in JSON format
--hard-link Use hard links when copying files (when
possible)
-i, --info TEXT bag-info.txt metadata in key:value format
@@ -206,6 +209,29 @@ In many situations it may make sense to create and sign bags on different machin
```
* The signed bag is then published to the archive, perhaps simply by copying the bag directory to a public file server.

Collection backends
-------------------

`bag-nabit` is not primarily a web archiving tool, but it supports collection backends that can gather both web content and file content. Collection tasks can be provided as a JSON list passed to the `--collect` flag of `nabit archive`:

```
nabit archive example_bag --collect '[
{"backend": "url", "url": "https://example.com/", "output": "example_com.html"},
{"backend": "path", "path": "/path/to/local/file"}
]'
```

Currently supported collection backends are:

* `url`: fetch URLs with Python `requests`, following redirects. Writes request and response metadata to `data/headers.warc`. Equivalent to the `-u` flag to `nabit archive` (see the example after this list). Keys:
* `url`: the URL to fetch
* `output` (optional): the path to save the fetched content to in the bag, relative to `data/files/`. If not provided, the content will be saved to `data/files/<url_path>`, where `<url_path>` is the last path component of the URL.
* `path`: copy local files or directories to the bag. Equivalent to the `-p` flag to `nabit archive`. Keys:
* `path`: the path to the local file or directory to copy
* `output` (optional): the path to save the copied content to in the bag.
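
For example, based on the CLI handling added in this commit, passing a JSON dict to `-u` should be equivalent to a single-task `--collect` list (the URL and output name here are placeholders):

```
nabit archive example_bag -u '{"url": "https://example.com/", "output": "example_com.html"}'
nabit archive example_bag --collect '[{"backend": "url", "url": "https://example.com/", "output": "example_com.html"}]'
```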

Future backends could include FTP, web crawlers, etc.
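
As a rough sketch only (not part of this commit), a new backend would be a `CollectionTask` dataclass with a `collect(files_dir)` method; the `FtpCollectionTask` name and its fields below are hypothetical:

```
import ftplib
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse

from nabit.lib.backends.base import CollectionTask
from nabit.lib.utils import get_unique_path

@dataclass
class FtpCollectionTask(CollectionTask):
    """Hypothetical backend: fetch a single file over anonymous FTP."""
    url: str                   # e.g. "ftp://ftp.example.com/pub/file.txt"
    output: str | None = None  # optional file name to use inside data/files/

    def collect(self, files_dir: Path) -> None:
        """Download self.url into the bag's data/files/ directory."""
        parsed = urlparse(self.url)
        name = self.output or Path(parsed.path).name or 'data'
        dest = get_unique_path(files_dir / name)
        with ftplib.FTP(parsed.hostname) as ftp:
            ftp.login()  # anonymous login
            with open(dest, 'wb') as out:
                ftp.retrbinary(f'RETR {parsed.path}', out.write)
```

Such a backend would also need an entry in the `get_backends()` registry (e.g. `'ftp': FtpCollectionTask`) so that `CollectionTask.from_dict`, and therefore `--collect`, can find it.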

File format
-----------

47 changes: 41 additions & 6 deletions src/nabit/bin/cli.py
@@ -4,8 +4,10 @@
from pathlib import Path

from .utils import assert_file_exists, assert_url, cli_validate, CaptureCommand
from ..lib.archive import package, validate_package
from ..lib.archive import package
from ..lib.sign import KNOWN_TSAS
from ..lib.backends.base import CollectionTask
from ..lib.backends.path import PathCollectionTask

@click.group()
def main():
@@ -16,8 +18,9 @@ def main():
@main.command(cls=CaptureCommand)
@click.argument('bag_path', type=click.Path(path_type=Path))
@click.option('--amend', '-a', is_flag=True, help='Update an existing archive. May add OR OVERWRITE existing data.')
@click.option('--url', '-u', 'urls', multiple=True, help='URL to archive (can be repeated)')
@click.option('--url', '-u', 'urls', multiple=True, help='URL to archive (can be repeated). May be a bare url or a JSON dict with a "url" key and an optional "output" key')
@click.option('--path', '-p', 'paths', multiple=True, type=click.Path(exists=True, path_type=Path), help='File or directory to archive (can be repeated)')
@click.option('--collect', '-c', 'collect', help='Collection tasks in JSON format')
@click.option('--hard-link', is_flag=True, help='Use hard links when copying files (when possible)')
@click.option('--info', '-i', multiple=True, help='bag-info.txt metadata in key:value format (can be repeated)')
@click.option('--signed-metadata', 'signed_metadata_path', type=click.Path(exists=True, path_type=Path, dir_okay=False),
@@ -43,6 +46,7 @@ def archive(
amend,
urls,
paths,
collect,
hard_link,
info,
signed_metadata_path,
@@ -98,9 +102,41 @@
raise click.BadParameter(f'Metadata must be in "key:value" format, got "{item}"')
bag_info[key.strip()].append(value.strip())

# validate URLs
# Parse the --collect JSON string into a list of task definitions
if collect:
try:
collect = json.loads(collect)
except json.JSONDecodeError:
raise click.BadParameter(f'Invalid JSON string for --collect: {collect}')
if not isinstance(collect, list):
raise click.BadParameter(f'--collect must be a list of JSON objects, got {collect}')
else:
collect = []

# Append --url and --path to --collect
for url in urls:
assert_url(url)
try:
url_dict = json.loads(url)
url_dict['backend'] = 'url'
except json.JSONDecodeError:
url_dict = {'backend': 'url', 'url': url}
collect.append(url_dict)
for path in paths:
collect.append({'backend': 'path', 'path': str(path)})

# Process and validate collect
processed_collect = []
for task in collect:
try:
processed_collect.append(CollectionTask.from_dict(task))
except Exception as e:
raise click.BadParameter(f'Invalid task definition for --collect: {task} resulted in {e}')

# handle --hard-link option
if hard_link:
for task in processed_collect:
if isinstance(task, PathCollectionTask):
task.hard_links = True

## handle --sign and --timestamp options
# order matters, so get ordered list of signature flags from sys.argv
@@ -139,8 +175,7 @@ def archive(

package(
output_path=bag_path,
paths=paths,
urls=urls,
collect=processed_collect,
bag_info=bag_info,
signatures=signatures,
signed_metadata=metadata['signed'],
46 changes: 10 additions & 36 deletions src/nabit/lib/archive.py
@@ -1,17 +1,16 @@
from pathlib import Path
import shutil
from datetime import date
import bagit
import os
from .utils import get_unique_path, noop
from .capture import validate_warc_headers, capture
from .sign import validate_signatures, KNOWN_TSAS, add_signatures
from .. import __version__
import hashlib
import json

# files to ignore when copying directories
IGNORE_PATTERNS = ['.DS_Store']
from .utils import noop
from .backends.url import validate_warc_headers
from .sign import validate_signatures, KNOWN_TSAS, add_signatures
from .. import __version__
from .backends.base import CollectionTask


def validate_bag_format(bag_path: Path, error, warn, success) -> None:
"""Verify bag format."""
@@ -54,34 +53,10 @@ def error(message: str, metadata: dict | None = None) -> None:
validate_bag_format(bag_path, error, warn, success)
validate_signatures(tagmanifest_path, error, warn, success)

def copy_paths(source_paths: list[Path | str], dest_dir: Path, use_hard_links: bool = False) -> None:
"""Copy paths to a destination directory, optionally using hard links."""
for path in source_paths:
path = Path(path)
dest_path = get_unique_path(dest_dir / path.name)
# can only use hard links if source and destination are on the same device
use_hard_links = use_hard_links and os.stat(path).st_dev == os.stat(dest_dir).st_dev
if path.is_file():
if use_hard_links:
os.link(path, dest_path)
else:
shutil.copy2(path, dest_path)
else:
copy_function = os.link if use_hard_links else shutil.copy2
# link directory contents recursively
shutil.copytree(
path,
dest_path,
dirs_exist_ok=True,
copy_function=copy_function,
ignore=shutil.ignore_patterns(*IGNORE_PATTERNS)
)

def package(
output_path: Path | str,
amend: bool = False,
urls: list[str] | None = None,
paths: list[Path | str] | None = None,
collect: list[CollectionTask] | None = None,
bag_info: dict | None = None,
signatures: list[dict] | None = None,
signed_metadata: dict | None = None,
@@ -105,10 +80,9 @@ def package(
files_path = data_path / 'files'
files_path.mkdir(exist_ok=True, parents=True)

if urls:
capture(urls, data_path / 'headers.warc')
if paths:
copy_paths(paths, files_path, use_hard_links)
if collect:
for task in collect:
task.collect(files_path)

# Add metadata files
if signed_metadata is not None:
20 changes: 20 additions & 0 deletions src/nabit/lib/backends/base.py
@@ -0,0 +1,20 @@
from dataclasses import dataclass
from functools import lru_cache

@lru_cache
def get_backends() -> dict[str, type['CollectionTask']]:
# do this in a cached function to avoid circular import
from .url import UrlCollectionTask
from .path import PathCollectionTask

return {
'url': UrlCollectionTask,
'path': PathCollectionTask,
}

@dataclass
class CollectionTask:
@classmethod
def from_dict(cls, data: dict) -> 'CollectionTask':
backend = data.pop('backend')
return get_backends()[backend](**data)
45 changes: 45 additions & 0 deletions src/nabit/lib/backends/path.py
@@ -0,0 +1,45 @@
import os
import shutil
from pathlib import Path
from dataclasses import dataclass
from ..utils import get_unique_path
from .base import CollectionTask

@dataclass
class PathCollectionTask(CollectionTask):
"""Collect files or directories from the local filesystem."""
path: Path
output: Path | None = None
hard_links: bool = False

ignore_patterns = ['.DS_Store']

def __post_init__(self):
"""Validate the path and ensure it's a Path object."""
self.path = Path(self.path) # Coerce to Path if it's a string
if not self.path.exists():
raise ValueError(f'Path "{self.path}" does not exist')
if self.output is not None:
self.output = Path(self.output) # Also coerce output if provided

def collect(self, files_dir: Path) -> None:
"""Copy paths to a destination directory, optionally using hard links."""
path = self.path
dest_path = get_unique_path(files_dir / path.name)
# can only use hard links if source and destination are on the same device
use_hard_links = self.hard_links and os.stat(path).st_dev == os.stat(files_dir).st_dev
if path.is_file():
if use_hard_links:
os.link(path, dest_path)
else:
shutil.copy2(path, dest_path)
else:
copy_function = os.link if use_hard_links else shutil.copy2
# link directory contents recursively
shutil.copytree(
path,
dest_path,
dirs_exist_ok=True,
copy_function=copy_function,
ignore=shutil.ignore_patterns(*self.ignore_patterns)
)
75 changes: 47 additions & 28 deletions src/nabit/lib/capture.py → src/nabit/lib/backends/url.py
@@ -4,9 +4,12 @@
from urllib.parse import urlparse
import mimetypes
from pathlib import Path
import requests # requests must be imported after capture_http
import requests
import os
from nabit.lib.utils import get_unique_path
from dataclasses import dataclass
from ..utils import get_unique_path
from .base import CollectionTask

"""
This file handles capturing of URLs and request/response metadata.
We use an unpacked WARC format to make it easier to access underlying data files.
@@ -29,11 +32,36 @@
even if the original response was gzip encoded in transit.
"""

@dataclass
class UrlCollectionTask(CollectionTask):
"""Collect URLs and request/response metadata."""
url: str
output: Path | None = None

def __post_init__(self):
"""Validate the URL by attempting to prepare a request."""
requests.Request('GET', self.url).prepare()

def collect(self, files_dir: Path) -> None:
"""
Capture URL to a WARC file using our custom FileWriter.
Appends to the WARC file if it already exists.
"""
warc_path = files_dir.parent / 'headers.warc'
with open(warc_path, 'ab') as fh:
warc_writer = FileWriter(fh, warc_path, gzip=False)
with capture_http(warc_writer):
warc_writer.custom_out_path = self.output
requests.get(self.url)


class FileWriter(WARCWriter):
"""
A WARC writer that stores response bodies uncompressed in the files/ directory.
"""
revisit_status_codes = set(['200', '203'])
custom_out_path = None # override output path

def __init__(self, filebuf, warc_path: Path, *args, **kwargs):
super(WARCWriter, self).__init__(*args, **kwargs)
self.out = filebuf
@@ -49,19 +77,23 @@ def _write_warc_record(self, out, record):
headers.replace_header('WARC-Type', 'revisit')

## get a filename for the response body
uri = headers.get_header('WARC-Target-URI')
parsed_url = urlparse(uri)
filename = Path(parsed_url.path.split('/')[-1])
# set stem
stem = filename.stem.lstrip('.') or 'data'
# set extension
extension = filename.suffix
if not extension:
if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch
extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False)
if not extension:
extension = '.unknown' # pragma: no cover
out_path = get_unique_path(self.files_path / f'{stem}{extension}')
if self.custom_out_path is not None:
out_path = self.custom_out_path
else:
uri = headers.get_header('WARC-Target-URI')
parsed_url = urlparse(uri)
filename = Path(parsed_url.path.split('/')[-1])
# set stem
stem = filename.stem.lstrip('.') or 'data'
# set extension
extension = filename.suffix
if not extension:
if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch
extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False)
if not extension:
extension = '.unknown' # pragma: no cover
out_path = f'{stem}{extension}'
out_path = get_unique_path(self.files_path / out_path)
relative_path = out_path.relative_to(self.warc_path.parent)

# add our custom WARC-Profile header
@@ -90,19 +122,6 @@ def _write_warc_record(self, out, record):

return super()._write_warc_record(out, record)


def capture(urls: list[str], warc_path: Path, request_kwargs: dict = {}) -> None:
"""
Capture a list of URLs to a WARC file using our custom FileWriter.
Appends to the WARC file if it already exists.
"""
use_gzip = str(warc_path).endswith('.gz')
with open(warc_path, 'ab') as fh:
warc_writer = FileWriter(fh, warc_path, gzip=use_gzip)
with capture_http(warc_writer):
for url in urls:
requests.get(url, **request_kwargs)

def validate_warc_headers(headers_path: Path, error, warn, success) -> None:
"""
Validate a headers.warc file created by capture().
Empty file added tests/backends/__init__.py
Empty file.
