Refactor to support collection backends, allowing for changing output path

jcushman committed Dec 4, 2024
1 parent 1dbaf9e commit ad1b8e4
Showing 13 changed files with 291 additions and 100 deletions.
28 changes: 27 additions & 1 deletion README.md
@@ -132,9 +132,12 @@ Usage: [OPTIONS] BAG_PATH
Options:
-a, --amend Update an existing archive. May add OR
OVERWRITE existing data.
-u, --url TEXT URL to archive (can be repeated)
-u, --url TEXT URL to archive (can be repeated). May be a
bare url or a JSON dict with a "url" key and
an optional "output" key
-p, --path PATH File or directory to archive (can be
repeated)
-c, --collect TEXT Collection tasks in JSON format
--hard-link Use hard links when copying files (when
possible)
-i, --info TEXT bag-info.txt metadata in key:value format
@@ -206,6 +209,29 @@ In many situations it may make sense to create and sign bags on different machin
```
* The signed bag is then published to the archive, perhaps simply by copying the bag directory to a public file server.

Collection backends
-------------------

`bag-nabit` is not primarily a web archiving tool, but it supports collection backends that can gather both web content and file content. Collection tasks can be provided as a JSON list passed to the `--collect` flag of `nabit archive`:

```
nabit archive example_bag --collect '[
{"backend": "url", "url": "https://example.com/", "output": "example_com.html"},
{"backend": "path", "path": "/path/to/local/file"}
]'
```

Currently supported collection backends are:

* `url`: fetch URLs with Python `requests`, following redirects. Writes request and response metadata to `data/headers.warc`. Equivalent to the `-u` flag to `nabit archive` (see the example after this list). Keys:
* `url`: the URL to fetch
* `output` (optional): the path to save the fetched content to in the bag, relative to `data/files/`. If not provided, the content will be saved to `data/files/<url_path>`, where `<url_path>` is the last path component of the URL.
* `path`: copy local files or directories to the bag. Equivalent to the `-p` flag to `nabit archive`. Keys:
* `path`: the path to the local file or directory to copy
* `output` (optional): the path to save the copied content to in the bag.
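
For example, based on the CLI handling added in this commit, passing a JSON dict to `-u` should be equivalent to a single-task `--collect` list (the URL and output name here are placeholders):

```
nabit archive example_bag -u '{"url": "https://example.com/", "output": "example_com.html"}'
nabit archive example_bag --collect '[{"backend": "url", "url": "https://example.com/", "output": "example_com.html"}]'
```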

Future backends could include FTP, web crawlers, etc.
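
As a rough sketch only (not part of this commit), a new backend would be a `CollectionTask` dataclass with a `collect(files_dir)` method; the `FtpCollectionTask` name and its fields below are hypothetical:

```
import ftplib
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse

from nabit.lib.backends.base import CollectionTask
from nabit.lib.utils import get_unique_path

@dataclass
class FtpCollectionTask(CollectionTask):
    """Hypothetical backend: fetch a single file over anonymous FTP."""
    url: str                   # e.g. "ftp://ftp.example.com/pub/file.txt"
    output: str | None = None  # optional file name to use inside data/files/

    def collect(self, files_dir: Path) -> None:
        """Download self.url into the bag's data/files/ directory."""
        parsed = urlparse(self.url)
        name = self.output or Path(parsed.path).name or 'data'
        dest = get_unique_path(files_dir / name)
        with ftplib.FTP(parsed.hostname) as ftp:
            ftp.login()  # anonymous login
            with open(dest, 'wb') as out:
                ftp.retrbinary(f'RETR {parsed.path}', out.write)
```

Such a backend would also need an entry in the `get_backends()` registry (e.g. `'ftp': FtpCollectionTask`) so that `CollectionTask.from_dict`, and therefore `--collect`, can find it.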

File format
-----------

47 changes: 41 additions & 6 deletions src/nabit/bin/cli.py
@@ -4,8 +4,10 @@
from pathlib import Path

from .utils import assert_file_exists, assert_url, cli_validate, CaptureCommand
from ..lib.archive import package, validate_package
from ..lib.archive import package
from ..lib.sign import KNOWN_TSAS
from ..lib.backends.base import CollectionTask
from ..lib.backends.path import PathCollectionTask

@click.group()
def main():
@@ -16,8 +18,9 @@ def main():
@main.command(cls=CaptureCommand)
@click.argument('bag_path', type=click.Path(path_type=Path))
@click.option('--amend', '-a', is_flag=True, help='Update an existing archive. May add OR OVERWRITE existing data.')
@click.option('--url', '-u', 'urls', multiple=True, help='URL to archive (can be repeated)')
@click.option('--url', '-u', 'urls', multiple=True, help='URL to archive (can be repeated). May be a bare url or a JSON dict with a "url" key and an optional "output" key')
@click.option('--path', '-p', 'paths', multiple=True, type=click.Path(exists=True, path_type=Path), help='File or directory to archive (can be repeated)')
@click.option('--collect', '-c', 'collect', help='Collection tasks in JSON format')
@click.option('--hard-link', is_flag=True, help='Use hard links when copying files (when possible)')
@click.option('--info', '-i', multiple=True, help='bag-info.txt metadata in key:value format (can be repeated)')
@click.option('--signed-metadata', 'signed_metadata_path', type=click.Path(exists=True, path_type=Path, dir_okay=False),
@@ -43,6 +46,7 @@ def archive(
amend,
urls,
paths,
collect,
hard_link,
info,
signed_metadata_path,
@@ -98,9 +102,41 @@
raise click.BadParameter(f'Metadata must be in "key:value" format, got "{item}"')
bag_info[key.strip()].append(value.strip())

# validate URLs
# Parse the --collect JSON string into a list of task definitions
if collect:
try:
collect = json.loads(collect)
except json.JSONDecodeError:
raise click.BadParameter(f'Invalid JSON string for --collect: {collect}')
if not isinstance(collect, list):
raise click.BadParameter(f'--collect must be a list of JSON objects, got {collect}')
else:
collect = []

# Append --url and --path to --collect
for url in urls:
assert_url(url)
try:
url_dict = json.loads(url)
url_dict['backend'] = 'url'
except json.JSONDecodeError:
url_dict = {'backend': 'url', 'url': url}
collect.append(url_dict)
for path in paths:
collect.append({'backend': 'path', 'path': str(path)})

# Process and validate collect
processed_collect = []
for task in collect:
try:
processed_collect.append(CollectionTask.from_dict(task))
except Exception as e:
raise click.BadParameter(f'Invalid task definition for --collect: {task} resulted in {e}')

# handle --hard-link option
if hard_link:
for task in processed_collect:
if isinstance(task, PathCollectionTask):
task.hard_links = True

## handle --sign and --timestamp options
# order matters, so get ordered list of signature flags from sys.argv
@@ -139,8 +175,7 @@ def archive(

package(
output_path=bag_path,
paths=paths,
urls=urls,
collect=processed_collect,
bag_info=bag_info,
signatures=signatures,
signed_metadata=metadata['signed'],
46 changes: 10 additions & 36 deletions src/nabit/lib/archive.py
@@ -1,17 +1,16 @@
from pathlib import Path
import shutil
from datetime import date
import bagit
import os
from .utils import get_unique_path, noop
from .capture import validate_warc_headers, capture
from .sign import validate_signatures, KNOWN_TSAS, add_signatures
from .. import __version__
import hashlib
import json

# files to ignore when copying directories
IGNORE_PATTERNS = ['.DS_Store']
from .utils import noop
from .backends.url import validate_warc_headers
from .sign import validate_signatures, KNOWN_TSAS, add_signatures
from .. import __version__
from .backends.base import CollectionTask


def validate_bag_format(bag_path: Path, error, warn, success) -> None:
"""Verify bag format."""
@@ -54,34 +53,10 @@ def error(message: str, metadata: dict | None = None) -> None:
validate_bag_format(bag_path, error, warn, success)
validate_signatures(tagmanifest_path, error, warn, success)

def copy_paths(source_paths: list[Path | str], dest_dir: Path, use_hard_links: bool = False) -> None:
"""Copy paths to a destination directory, optionally using hard links."""
for path in source_paths:
path = Path(path)
dest_path = get_unique_path(dest_dir / path.name)
# can only use hard links if source and destination are on the same device
use_hard_links = use_hard_links and os.stat(path).st_dev == os.stat(dest_dir).st_dev
if path.is_file():
if use_hard_links:
os.link(path, dest_path)
else:
shutil.copy2(path, dest_path)
else:
copy_function = os.link if use_hard_links else shutil.copy2
# link directory contents recursively
shutil.copytree(
path,
dest_path,
dirs_exist_ok=True,
copy_function=copy_function,
ignore=shutil.ignore_patterns(*IGNORE_PATTERNS)
)

def package(
output_path: Path | str,
amend: bool = False,
urls: list[str] | None = None,
paths: list[Path | str] | None = None,
collect: list[CollectionTask] | None = None,
bag_info: dict | None = None,
signatures: list[dict] | None = None,
signed_metadata: dict | None = None,
@@ -105,10 +80,9 @@ def package(
files_path = data_path / 'files'
files_path.mkdir(exist_ok=True, parents=True)

if urls:
capture(urls, data_path / 'headers.warc')
if paths:
copy_paths(paths, files_path, use_hard_links)
if collect:
for task in collect:
task.collect(files_path)

# Add metadata files
if signed_metadata is not None:
20 changes: 20 additions & 0 deletions src/nabit/lib/backends/base.py
@@ -0,0 +1,20 @@
from dataclasses import dataclass
from functools import lru_cache

@lru_cache
def get_backends() -> dict[str, type['CollectionTask']]:
# do this in a cached function to avoid circular import
from .url import UrlCollectionTask
from .path import PathCollectionTask

return {
'url': UrlCollectionTask,
'path': PathCollectionTask,
}

@dataclass
class CollectionTask:
@classmethod
def from_dict(cls, data: dict) -> 'CollectionTask':
backend = data.pop('backend')
return get_backends()[backend](**data)
45 changes: 45 additions & 0 deletions src/nabit/lib/backends/path.py
@@ -0,0 +1,45 @@
import os
import shutil
from pathlib import Path
from dataclasses import dataclass
from ..utils import get_unique_path
from .base import CollectionTask

@dataclass
class PathCollectionTask(CollectionTask):
"""Collect files or directories from the local filesystem."""
path: Path
output: Path | None = None
hard_links: bool = False

ignore_patterns = ['.DS_Store']

def __post_init__(self):
"""Validate the path and ensure it's a Path object."""
self.path = Path(self.path) # Coerce to Path if it's a string
if not self.path.exists():
raise ValueError(f'Path "{self.path}" does not exist')
if self.output is not None:
self.output = Path(self.output) # Also coerce output if provided

def collect(self, files_dir: Path) -> None:
"""Copy paths to a destination directory, optionally using hard links."""
path = self.path
dest_path = get_unique_path(files_dir / path.name)
# can only use hard links if source and destination are on the same device
use_hard_links = self.hard_links and os.stat(path).st_dev == os.stat(files_dir).st_dev
if path.is_file():
if use_hard_links:
os.link(path, dest_path)
else:
shutil.copy2(path, dest_path)
else:
copy_function = os.link if use_hard_links else shutil.copy2
# link directory contents recursively
shutil.copytree(
path,
dest_path,
dirs_exist_ok=True,
copy_function=copy_function,
ignore=shutil.ignore_patterns(*self.ignore_patterns)
)
75 changes: 47 additions & 28 deletions src/nabit/lib/capture.py → src/nabit/lib/backends/url.py
@@ -4,9 +4,12 @@
from urllib.parse import urlparse
import mimetypes
from pathlib import Path
import requests # requests must be imported after capture_http
import requests
import os
from nabit.lib.utils import get_unique_path
from dataclasses import dataclass
from ..utils import get_unique_path
from .base import CollectionTask

"""
This file handles capturing of URLs and request/response metadata.
We use an unpacked WARC format to make it easier to access underlying data files.
@@ -29,11 +32,36 @@
even if the original response was gzip encoded in transit.
"""

@dataclass
class UrlCollectionTask(CollectionTask):
"""Collect URLs and request/response metadata."""
url: str
output: Path | None = None

def __post_init__(self):
"""Validate the URL by attempting to prepare a request."""
requests.Request('GET', self.url).prepare()

def collect(self, files_dir: Path) -> None:
"""
Capture URL to a WARC file using our custom FileWriter.
Appends to the WARC file if it already exists.
"""
warc_path = files_dir.parent / 'headers.warc'
with open(warc_path, 'ab') as fh:
warc_writer = FileWriter(fh, warc_path, gzip=False)
with capture_http(warc_writer):
warc_writer.custom_out_path = self.output
requests.get(self.url)


class FileWriter(WARCWriter):
"""
A WARC writer that stores response bodies uncompressed in the files/ directory.
"""
revisit_status_codes = set(['200', '203'])
custom_out_path = None # override output path

def __init__(self, filebuf, warc_path: Path, *args, **kwargs):
super(WARCWriter, self).__init__(*args, **kwargs)
self.out = filebuf
@@ -49,19 +77,23 @@ def _write_warc_record(self, out, record):
headers.replace_header('WARC-Type', 'revisit')

## get a filename for the response body
uri = headers.get_header('WARC-Target-URI')
parsed_url = urlparse(uri)
filename = Path(parsed_url.path.split('/')[-1])
# set stem
stem = filename.stem.lstrip('.') or 'data'
# set extension
extension = filename.suffix
if not extension:
if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch
extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False)
if not extension:
extension = '.unknown' # pragma: no cover
out_path = get_unique_path(self.files_path / f'{stem}{extension}')
if self.custom_out_path is not None:
out_path = self.custom_out_path
else:
uri = headers.get_header('WARC-Target-URI')
parsed_url = urlparse(uri)
filename = Path(parsed_url.path.split('/')[-1])
# set stem
stem = filename.stem.lstrip('.') or 'data'
# set extension
extension = filename.suffix
if not extension:
if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch
extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False)
if not extension:
extension = '.unknown' # pragma: no cover
out_path = f'{stem}{extension}'
out_path = get_unique_path(self.files_path / out_path)
relative_path = out_path.relative_to(self.warc_path.parent)

# add our custom WARC-Profile header
@@ -90,19 +122,6 @@ def _write_warc_record(self, out, record):

return super()._write_warc_record(out, record)


def capture(urls: list[str], warc_path: Path, request_kwargs: dict = {}) -> None:
"""
Capture a list of URLs to a WARC file using our custom FileWriter.
Appends to the WARC file if it already exists.
"""
use_gzip = str(warc_path).endswith('.gz')
with open(warc_path, 'ab') as fh:
warc_writer = FileWriter(fh, warc_path, gzip=use_gzip)
with capture_http(warc_writer):
for url in urls:
requests.get(url, **request_kwargs)

def validate_warc_headers(headers_path: Path, error, warn, success) -> None:
"""
Validate a headers.warc file created by capture().
Empty file added tests/backends/__init__.py
Empty file.
