Skip to content

Commit

Permalink
Add collection_tasks and id to signed-metadata.json
Browse files Browse the repository at this point in the history
* Also add --collect-errors flag to control error handling
* Also add --timeout flag to set timeouts for URLs
  • Loading branch information
jcushman committed Dec 5, 2024
1 parent bb9821e commit 0e7e5af
Show file tree
Hide file tree
Showing 13 changed files with 251 additions and 49 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ Options:
-t, --timestamp <tsa_keyword> | <cert_chain>:<url>
Timestamp using either a TSA keyword or a
cert chain path and URL (can be repeated)
--timeout FLOAT Timeout for collection tasks (default: 5.0)
--collect-errors [fail|ignore] How to handle collection task errors
(default: fail)
--help Show this message and exit.
```

Expand Down
26 changes: 19 additions & 7 deletions src/nabit/bin/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from .utils import assert_file_exists, assert_url, cli_validate, CaptureCommand
from ..lib.archive import package
from ..lib.sign import KNOWN_TSAS
from ..lib.backends.base import CollectionTask
from ..lib.backends.base import CollectionTask, CollectionError
from ..lib.backends.path import PathCollectionTask

@click.group()
Expand Down Expand Up @@ -39,6 +39,10 @@ def main():
help='Timestamp using either a TSA keyword or a cert chain path and URL (can be repeated)',
metavar='<tsa_keyword> | <cert_chain>:<url>',
)
@click.option('--timeout', type=float, default=5.0,
help='Timeout for collection tasks (default: 5.0)')
@click.option('--collect-errors', type=click.Choice(['fail', 'ignore']), default='fail',
help='How to handle collection task errors (default: fail)')
@click.pass_context
def archive(
ctx,
Expand All @@ -53,7 +57,9 @@ def archive(
unsigned_metadata_path,
signed_metadata_json,
unsigned_metadata_json,
signature_args
signature_args,
collect_errors,
timeout,
):
"""
Archive files and URLs into a BagIt package.
Expand Down Expand Up @@ -128,7 +134,10 @@ def archive(
processed_collect = []
for task in collect:
try:
processed_collect.append(CollectionTask.from_dict(task))
task = CollectionTask.from_dict(task)
if hasattr(task, 'timeout'):
task.timeout = timeout
processed_collect.append(task)
except Exception as e:
raise click.BadParameter(f'Invalid task definition for --collect: {task} resulted in {e}')

Expand Down Expand Up @@ -173,16 +182,19 @@ def archive(

click.echo(f"Creating package at {bag_path} ...")

package(
output_path=bag_path,
try:
package(
output_path=bag_path,
collect=processed_collect,
bag_info=bag_info,
signatures=signatures,
signed_metadata=metadata['signed'],
unsigned_metadata=metadata['unsigned'],
amend=amend,
use_hard_links=hard_link,
)
collect_errors=collect_errors,
)
except CollectionError as e:
raise click.BadParameter(f'Collection task failed: {e}')

cli_validate(bag_path)

Expand Down
37 changes: 31 additions & 6 deletions src/nabit/lib/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import os
import hashlib
import json

import uuid
from .utils import noop
from .backends.url import validate_warc_headers
from .sign import validate_signatures, KNOWN_TSAS, add_signatures
from .. import __version__
from .backends.base import CollectionTask
from .backends.base import CollectionTask, CollectionError
from typing import Literal


def validate_bag_format(bag_path: Path, error, warn, success) -> None:
Expand All @@ -27,6 +28,13 @@ def validate_bag_format(bag_path: Path, error, warn, success) -> None:

def validate_data_files(bag_path: Path, error = None, warn = noop, success = noop) -> None:
"""Validate only expected files are present in data/."""

# make sure there are files in files_path
files_path = bag_path / "data/files"
if not files_path.exists() or not any(files_path.iterdir()):
warn("No files in data/files")

# make sure only expected files are present
expected_files = set(['files', 'headers.warc', 'signed-metadata.json'])
actual_files = set(f.name for f in bag_path.glob('data/*'))
unexpected_files = actual_files - expected_files
Expand Down Expand Up @@ -61,7 +69,7 @@ def package(
signatures: list[dict] | None = None,
signed_metadata: dict | None = None,
unsigned_metadata: dict | None = None,
use_hard_links: bool = False,
collect_errors: Literal['fail', 'ignore'] = 'fail',
) -> None:
"""
Create a BagIt package.
Expand All @@ -79,15 +87,32 @@ def package(
data_path = output_path / 'data'
files_path = data_path / 'files'
files_path.mkdir(exist_ok=True, parents=True)
signed_metadata_path = data_path / "signed-metadata.json"

# set or extend signed metadata
if signed_metadata is None:
if signed_metadata_path.exists():
signed_metadata = json.loads(signed_metadata_path.read_text())
else:
signed_metadata = {}

if not signed_metadata.get('id'):
signed_metadata['id'] = str(uuid.uuid4())

# run collection tasks and record results
if collect:
results = []
for task in collect:
task.collect(files_path)
result = task.collect(files_path)
if collect_errors == 'fail' and not result['response']['success']:
raise CollectionError(f"Collection task failed: {result}")
results.append(result)
signed_metadata.setdefault('collection_tasks', []).extend(results)

# Add metadata files
if signed_metadata is not None:
if signed_metadata:
(data_path / "signed-metadata.json").write_text(json.dumps(signed_metadata, indent=2))
if unsigned_metadata is not None:
if unsigned_metadata:
(output_path / "unsigned-metadata.json").write_text(json.dumps(unsigned_metadata, indent=2))

## add bag files
Expand Down
26 changes: 23 additions & 3 deletions src/nabit/lib/backends/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dataclasses import dataclass
from dataclasses import dataclass, asdict
from functools import lru_cache
from pathlib import Path

@lru_cache
def get_backends() -> dict[str, type['CollectionTask']]:
Expand All @@ -8,13 +9,32 @@ def get_backends() -> dict[str, type['CollectionTask']]:
from .path import PathCollectionTask

return {
'url': UrlCollectionTask,
'path': PathCollectionTask,
UrlCollectionTask.backend: UrlCollectionTask,
PathCollectionTask.backend: PathCollectionTask,
}

class CollectionError(Exception):
    """Raised when a collection task fails to capture its target data."""

@dataclass
class CollectionTask:
    """Abstract base class for collection backends.

    Concrete subclasses (see ``get_backends()``) provide a ``backend`` name,
    implement ``_collect`` to capture data into a target directory, and
    implement ``request_dict`` to describe the originating request.
    """

    @classmethod
    def from_dict(cls, data: dict) -> 'CollectionTask':
        """Instantiate the backend named by ``data['backend']`` with the remaining keys.

        Operates on a shallow copy so the caller's dict is not mutated
        (previously the 'backend' key was popped from the caller's dict,
        so error messages built from that dict lost the backend name).
        """
        data = dict(data)
        backend = data.pop('backend')
        return get_backends()[backend](**data)

    def collect(self, files_dir: Path) -> dict:
        """Call the backend-specific _collect method and return the result, handling any errors.

        Returns a dict of the form ``{'request': ..., 'response': ...}`` where
        the response carries a ``'success'`` flag, plus an ``'error'`` string
        when ``_collect`` raised. Errors are recorded rather than propagated;
        callers decide whether a failed task is fatal (e.g. via the
        ``collect_errors`` option of ``package()``).
        """
        try:
            result = self._collect(files_dir)
            result['success'] = True
        except Exception as e:
            # capture any backend failure as data instead of raising
            result = {'success': False, 'error': str(e)}
        return {
            'request': self.request_dict(),
            'response': result,
        }

    def _collect(self, files_dir: Path) -> dict:
        """Collect the data to the given directory."""
        raise NotImplementedError
14 changes: 13 additions & 1 deletion src/nabit/lib/backends/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
@dataclass
class PathCollectionTask(CollectionTask):
"""Collect files or directories from the local filesystem."""
backend = 'path'

path: Path
output: Path | None = None
hard_links: bool = False
Expand All @@ -22,7 +24,7 @@ def __post_init__(self):
if self.output is not None:
self.output = Path(self.output) # Also coerce output if provided

def collect(self, files_dir: Path) -> None:
def _collect(self, files_dir: Path) -> Path:
"""Copy paths to a destination directory, optionally using hard links."""
path = self.path
dest_path = get_unique_path(files_dir / path.name)
Expand All @@ -43,3 +45,13 @@ def collect(self, files_dir: Path) -> None:
copy_function=copy_function,
ignore=shutil.ignore_patterns(*self.ignore_patterns)
)
return {'path': str(dest_path.relative_to(files_dir))}

def request_dict(self) -> dict:
    """Return a dictionary representation of the request."""
    serialized_output = str(self.output) if self.output else None
    return dict(
        path=str(self.path),
        output=serialized_output,
        hard_links=self.hard_links,
        ignore_patterns=self.ignore_patterns,
    )
18 changes: 14 additions & 4 deletions src/nabit/lib/backends/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
@dataclass
class UrlCollectionTask(CollectionTask):
"""Collect URLs and request/response metadata."""
backend = 'url'

url: str
output: Path | None = None

Expand All @@ -44,7 +46,7 @@ def __post_init__(self):
"""Validate the URL by attempting to prepare a request."""
requests.Request('GET', self.url).prepare()

def collect(self, files_dir: Path) -> None:
def _collect(self, files_dir: Path) -> None:
"""
Capture URL to a WARC file using our custom FileWriter.
Appends to the WARC file if it already exists.
Expand All @@ -55,6 +57,15 @@ def collect(self, files_dir: Path) -> None:
with capture_http(warc_writer):
warc_writer.custom_out_path = self.output
requests.get(self.url, timeout=self.timeout)
return {'path': str(warc_writer.result_path)}

def request_dict(self) -> dict:
    """Return a dictionary representation of the request."""
    serialized_output = str(self.output) if self.output else None
    return dict(
        url=self.url,
        output=serialized_output,
        timeout=self.timeout,
    )


class FileWriter(WARCWriter):
Expand All @@ -63,6 +74,7 @@ class FileWriter(WARCWriter):
"""
revisit_status_codes = set(['200', '203'])
custom_out_path = None # override output path
result_path = None

def __init__(self, filebuf, warc_path: Path, *args, **kwargs):
super(WARCWriter, self).__init__(*args, **kwargs)
Expand Down Expand Up @@ -97,6 +109,7 @@ def _write_warc_record(self, out, record):
out_path = f'{stem}{extension}'
out_path = get_unique_path(self.files_path / out_path)
relative_path = out_path.relative_to(self.warc_path.parent)
self.result_path = out_path.relative_to(self.files_path)

# add our custom WARC-Profile header
headers.add_header('WARC-Profile', f'file-content; filename="{relative_path}"')
Expand Down Expand Up @@ -136,9 +149,6 @@ def validate_warc_headers(headers_path: Path, error, warn, success) -> None:
data_path = headers_path.parent
files_path = data_path / "files"

# make sure there are files in files_path
if not files_path.exists() or not any(files_path.iterdir()):
error("No files in data/files")
if not headers_path.exists():
warn("No headers.warc found; archive lacks request and response metadata")
else:
Expand Down
21 changes: 19 additions & 2 deletions tests/backends/test_path_backend.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from nabit.lib.backends.path import PathCollectionTask

from inline_snapshot import snapshot
from ..utils import filter_str

def test_ds_store_ignored(tmp_path):
"""Test that files in ignore_patterns are ignored when copying directories"""
Expand All @@ -14,7 +15,23 @@ def test_ds_store_ignored(tmp_path):
dest_dir.mkdir()

# Test copying
PathCollectionTask(path=str(source_dir)).collect(dest_dir)
response = PathCollectionTask(path=str(source_dir)).collect(dest_dir)
assert filter_str(response, path=tmp_path) == snapshot("""\
{
"request": {
"path": "<path>/test_dir",
"output": null,
"hard_links": false,
"ignore_patterns": [
".DS_Store"
]
},
"response": {
"path": "test_dir",
"success": true
}
}\
""")

# Verify results
assert not (dest_dir / "test_dir/.DS_Store").exists()
Expand Down
Loading

0 comments on commit 0e7e5af

Please sign in to comment.