From ad1b8e4ea4c190c8bbfc4717cc9145c6746a1615 Mon Sep 17 00:00:00 2001
From: Jack Cushman
Date: Wed, 4 Dec 2024 14:23:54 -0500
Subject: [PATCH] Refactor to support collection backends, allowing for
 changing output path

---
 README.md                                     | 28 ++++++-
 src/nabit/bin/cli.py                          | 47 ++++++++++--
 src/nabit/lib/archive.py                      | 46 +++---------
 src/nabit/lib/backends/__init__.py            |  0
 src/nabit/lib/backends/base.py                | 20 +++++
 src/nabit/lib/backends/path.py                | 45 +++++++++++
 src/nabit/lib/{capture.py => backends/url.py} | 75 ++++++++++++-------
 tests/backends/__init__.py                    |  0
 .../test_path_backend.py}                     |  9 +--
 tests/backends/test_url_backend.py            | 21 ++++++
 tests/conftest.py                             | 17 ++++-
 tests/test_archive.py                         | 21 +-----
 tests/test_cli.py                             | 62 +++++++++++++++
 13 files changed, 291 insertions(+), 100 deletions(-)
 create mode 100644 src/nabit/lib/backends/__init__.py
 create mode 100644 src/nabit/lib/backends/base.py
 create mode 100644 src/nabit/lib/backends/path.py
 rename src/nabit/lib/{capture.py => backends/url.py} (73%)
 create mode 100644 tests/backends/__init__.py
 rename tests/{test_capture.py => backends/test_path_backend.py} (90%)
 create mode 100644 tests/backends/test_url_backend.py

diff --git a/README.md b/README.md
index dda0cfe..2ca0188 100644
--- a/README.md
+++ b/README.md
@@ -132,9 +132,12 @@ Usage: [OPTIONS] BAG_PATH
 Options:
   -a, --amend             Update an existing archive. May add OR OVERWRITE
                           existing data.
-  -u, --url TEXT          URL to archive (can be repeated)
+  -u, --url TEXT          URL to archive (can be repeated). May be a
+                          bare url or a JSON dict with a "url" key and
+                          an optional "output" key
   -p, --path PATH         File or directory to archive (can be repeated)
+  -c, --collect TEXT      Collection tasks in JSON format
   --hard-link             Use hard links when copying files (when
                           possible)
   -i, --info TEXT         bag-info.txt metadata in key:value format
@@ -206,6 +209,29 @@ In many situations it may make sense to create and sign bags on different machin
 ```
 * The signed bag is then published to the archive, perhaps simply by copying the bag directory to a public file server.
 
+Collection backends
+-------------------
+
+`bag-nabit` is not primarily a web archiving tool, but it supports collection backends that can gather both web content and file content. Collection tasks can be provided as a JSON list passed to the `--collect` flag of `nabit archive`:
+
+```
+nabit archive example_bag --collect '[
+  {"backend": "url", "url": "https://example.com/", "output": "example_com.html"},
+  {"backend": "path", "path": "/path/to/local/file"}
+]'
+```
+
+Currently supported collection backends are:
+
+* `url`: fetch URLs with Python `requests`, following redirects. Request and response metadata is written to `data/headers.warc`. Equivalent to the `-u` flag of `nabit archive`. Keys:
+  * `url`: the URL to fetch
+  * `output` (optional): the path to save the fetched content to in the bag, relative to `data/files/`. If not provided, the content is saved to `data/files/<filename>`, where `<filename>` is the last path component of the URL.
+* `path`: copy local files or directories to the bag. Equivalent to the `-p` flag of `nabit archive`. Keys:
+  * `path`: the path to the local file or directory to copy
+  * `output` (optional): the path to save the copied content to in the bag.
+
+Future backends could include FTP, web crawlers, etc.
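For a single task, the same keys can also be passed directly to `-u` as a JSON dict instead of a bare URL. A hypothetical invocation, mirroring the `-u` option help above and the `test_url_with_custom_output_path` test added below:

```
nabit archive example_bag -u '{"url": "https://example.com/", "output": "example_com.html"}'
```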
+ File format ----------- diff --git a/src/nabit/bin/cli.py b/src/nabit/bin/cli.py index d90ad22..aafdc52 100644 --- a/src/nabit/bin/cli.py +++ b/src/nabit/bin/cli.py @@ -4,8 +4,10 @@ from pathlib import Path from .utils import assert_file_exists, assert_url, cli_validate, CaptureCommand -from ..lib.archive import package, validate_package +from ..lib.archive import package from ..lib.sign import KNOWN_TSAS +from ..lib.backends.base import CollectionTask +from ..lib.backends.path import PathCollectionTask @click.group() def main(): @@ -16,8 +18,9 @@ def main(): @main.command(cls=CaptureCommand) @click.argument('bag_path', type=click.Path(path_type=Path)) @click.option('--amend', '-a', is_flag=True, help='Update an existing archive. May add OR OVERWRITE existing data.') -@click.option('--url', '-u', 'urls', multiple=True, help='URL to archive (can be repeated)') +@click.option('--url', '-u', 'urls', multiple=True, help='URL to archive (can be repeated). May be a bare url or a JSON dict with a "url" key and an optional "output" key') @click.option('--path', '-p', 'paths', multiple=True, type=click.Path(exists=True, path_type=Path), help='File or directory to archive (can be repeated)') +@click.option('--collect', '-c', 'collect', help='Collection tasks in JSON format') @click.option('--hard-link', is_flag=True, help='Use hard links when copying files (when possible)') @click.option('--info', '-i', multiple=True, help='bag-info.txt metadata in key:value format (can be repeated)') @click.option('--signed-metadata', 'signed_metadata_path', type=click.Path(exists=True, path_type=Path, dir_okay=False), @@ -43,6 +46,7 @@ def archive( amend, urls, paths, + collect, hard_link, info, signed_metadata_path, @@ -98,9 +102,41 @@ def archive( raise click.BadParameter(f'Metadata must be in "key:value" format, got "{item}"') bag_info[key.strip()].append(value.strip()) - # validate URLs + # Convert collect to list if it's a tuple + if collect: + try: + collect = json.loads(collect) + except json.JSONDecodeError: + raise click.BadParameter(f'Invalid JSON string for --collect: {collect}') + if not isinstance(collect, list): + raise click.BadParameter(f'--collect must be a list of JSON objects, got {collect}') + else: + collect = [] + + # Append --url and --path to --collect for url in urls: - assert_url(url) + try: + url_dict = json.loads(url) + url_dict['backend'] = 'url' + except json.JSONDecodeError: + url_dict = {'backend': 'url', 'url': url} + collect.append(url_dict) + for path in paths: + collect.append({'backend': 'path', 'path': str(path)}) + + # Process and validate collect + processed_collect = [] + for task in collect: + try: + processed_collect.append(CollectionTask.from_dict(task)) + except Exception as e: + raise click.BadParameter(f'Invalid task definition for --collect: {task} resulted in {e}') + + # handle --hard-link option + if hard_link: + for task in processed_collect: + if isinstance(task, PathCollectionTask): + task.hard_links = True ## handle --sign and --timestamp options # order matters, so get ordered list of signature flags from sys.argv @@ -139,8 +175,7 @@ def archive( package( output_path=bag_path, - paths=paths, - urls=urls, + collect=processed_collect, bag_info=bag_info, signatures=signatures, signed_metadata=metadata['signed'], diff --git a/src/nabit/lib/archive.py b/src/nabit/lib/archive.py index f5214eb..243cb7f 100644 --- a/src/nabit/lib/archive.py +++ b/src/nabit/lib/archive.py @@ -1,17 +1,16 @@ from pathlib import Path -import shutil from datetime import date import bagit 
import os -from .utils import get_unique_path, noop -from .capture import validate_warc_headers, capture -from .sign import validate_signatures, KNOWN_TSAS, add_signatures -from .. import __version__ import hashlib import json -# files to ignore when copying directories -IGNORE_PATTERNS = ['.DS_Store'] +from .utils import noop +from .backends.url import validate_warc_headers +from .sign import validate_signatures, KNOWN_TSAS, add_signatures +from .. import __version__ +from .backends.base import CollectionTask + def validate_bag_format(bag_path: Path, error, warn, success) -> None: """Verify bag format.""" @@ -54,34 +53,10 @@ def error(message: str, metadata: dict | None = None) -> None: validate_bag_format(bag_path, error, warn, success) validate_signatures(tagmanifest_path, error, warn, success) -def copy_paths(source_paths: list[Path | str], dest_dir: Path, use_hard_links: bool = False) -> None: - """Copy paths to a destination directory, optionally using hard links.""" - for path in source_paths: - path = Path(path) - dest_path = get_unique_path(dest_dir / path.name) - # can only use hard links if source and destination are on the same device - use_hard_links = use_hard_links and os.stat(path).st_dev == os.stat(dest_dir).st_dev - if path.is_file(): - if use_hard_links: - os.link(path, dest_path) - else: - shutil.copy2(path, dest_path) - else: - copy_function = os.link if use_hard_links else shutil.copy2 - # link directory contents recursively - shutil.copytree( - path, - dest_path, - dirs_exist_ok=True, - copy_function=copy_function, - ignore=shutil.ignore_patterns(*IGNORE_PATTERNS) - ) - def package( output_path: Path | str, amend: bool = False, - urls: list[str] | None = None, - paths: list[Path | str] | None = None, + collect: list[CollectionTask] | None = None, bag_info: dict | None = None, signatures: list[dict] | None = None, signed_metadata: dict | None = None, @@ -105,10 +80,9 @@ def package( files_path = data_path / 'files' files_path.mkdir(exist_ok=True, parents=True) - if urls: - capture(urls, data_path / 'headers.warc') - if paths: - copy_paths(paths, files_path, use_hard_links) + if collect: + for task in collect: + task.collect(files_path) # Add metadata files if signed_metadata is not None: diff --git a/src/nabit/lib/backends/__init__.py b/src/nabit/lib/backends/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nabit/lib/backends/base.py b/src/nabit/lib/backends/base.py new file mode 100644 index 0000000..2b8e47e --- /dev/null +++ b/src/nabit/lib/backends/base.py @@ -0,0 +1,20 @@ +from dataclasses import dataclass +from functools import lru_cache + +@lru_cache +def get_backends() -> dict[str, type['CollectionTask']]: + # do this in a cached function to avoid circular import + from .url import UrlCollectionTask + from .path import PathCollectionTask + + return { + 'url': UrlCollectionTask, + 'path': PathCollectionTask, + } + +@dataclass +class CollectionTask: + @classmethod + def from_dict(cls, data: dict) -> 'CollectionTask': + backend = data.pop('backend') + return get_backends()[backend](**data) diff --git a/src/nabit/lib/backends/path.py b/src/nabit/lib/backends/path.py new file mode 100644 index 0000000..5170a0b --- /dev/null +++ b/src/nabit/lib/backends/path.py @@ -0,0 +1,45 @@ +import os +import shutil +from pathlib import Path +from dataclasses import dataclass +from ..utils import get_unique_path +from .base import CollectionTask + +@dataclass +class PathCollectionTask(CollectionTask): + """Collect files or directories from the local 
filesystem.""" + path: Path + output: Path | None = None + hard_links: bool = False + + ignore_patterns = ['.DS_Store'] + + def __post_init__(self): + """Validate the path and ensure it's a Path object.""" + self.path = Path(self.path) # Coerce to Path if it's a string + if not self.path.exists(): + raise ValueError(f'Path "{self.path}" does not exist') + if self.output is not None: + self.output = Path(self.output) # Also coerce output if provided + + def collect(self, files_dir: Path) -> None: + """Copy paths to a destination directory, optionally using hard links.""" + path = self.path + dest_path = get_unique_path(files_dir / path.name) + # can only use hard links if source and destination are on the same device + use_hard_links = self.hard_links and os.stat(path).st_dev == os.stat(files_dir).st_dev + if path.is_file(): + if use_hard_links: + os.link(path, dest_path) + else: + shutil.copy2(path, dest_path) + else: + copy_function = os.link if use_hard_links else shutil.copy2 + # link directory contents recursively + shutil.copytree( + path, + dest_path, + dirs_exist_ok=True, + copy_function=copy_function, + ignore=shutil.ignore_patterns(*self.ignore_patterns) + ) diff --git a/src/nabit/lib/capture.py b/src/nabit/lib/backends/url.py similarity index 73% rename from src/nabit/lib/capture.py rename to src/nabit/lib/backends/url.py index 542d0da..b07ea5d 100644 --- a/src/nabit/lib/capture.py +++ b/src/nabit/lib/backends/url.py @@ -4,9 +4,12 @@ from urllib.parse import urlparse import mimetypes from pathlib import Path -import requests # requests must be imported after capture_http +import requests import os -from nabit.lib.utils import get_unique_path +from dataclasses import dataclass +from ..utils import get_unique_path +from .base import CollectionTask + """ This file handles capturing of URLs and request/response metadata. We use an unpacked WARC format to make it easier to access underlying data files. @@ -29,11 +32,36 @@ even if the original response was gzip encoded in transit. """ +@dataclass +class UrlCollectionTask(CollectionTask): + """Collect URLs and request/response metadata.""" + url: str + output: Path | None = None + + def __post_init__(self): + """Validate the URL by attempting to prepare a request.""" + requests.Request('GET', self.url).prepare() + + def collect(self, files_dir: Path) -> None: + """ + Capture URL to a WARC file using our custom FileWriter. + Appends to the WARC file if it already exists. + """ + warc_path = files_dir.parent / 'headers.warc' + with open(warc_path, 'ab') as fh: + warc_writer = FileWriter(fh, warc_path, gzip=False) + with capture_http(warc_writer): + warc_writer.custom_out_path = self.output + requests.get(self.url) + + class FileWriter(WARCWriter): """ A WARC writer that stores response bodies uncompressed in the files/ directory. 
""" revisit_status_codes = set(['200', '203']) + custom_out_path = None # override output path + def __init__(self, filebuf, warc_path: Path, *args, **kwargs): super(WARCWriter, self).__init__(*args, **kwargs) self.out = filebuf @@ -49,19 +77,23 @@ def _write_warc_record(self, out, record): headers.replace_header('WARC-Type', 'revisit') ## get a filename for the response body - uri = headers.get_header('WARC-Target-URI') - parsed_url = urlparse(uri) - filename = Path(parsed_url.path.split('/')[-1]) - # set stem - stem = filename.stem.lstrip('.') or 'data' - # set extension - extension = filename.suffix - if not extension: - if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch - extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False) - if not extension: - extension = '.unknown' # pragma: no cover - out_path = get_unique_path(self.files_path / f'{stem}{extension}') + if self.custom_out_path is not None: + out_path = self.custom_out_path + else: + uri = headers.get_header('WARC-Target-URI') + parsed_url = urlparse(uri) + filename = Path(parsed_url.path.split('/')[-1]) + # set stem + stem = filename.stem.lstrip('.') or 'data' + # set extension + extension = filename.suffix + if not extension: + if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch + extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False) + if not extension: + extension = '.unknown' # pragma: no cover + out_path = f'{stem}{extension}' + out_path = get_unique_path(self.files_path / out_path) relative_path = out_path.relative_to(self.warc_path.parent) # add our custom WARC-Profile header @@ -90,19 +122,6 @@ def _write_warc_record(self, out, record): return super()._write_warc_record(out, record) - -def capture(urls: list[str], warc_path: Path, request_kwargs: dict = {}) -> None: - """ - Capture a list of URLs to a WARC file using our custom FileWriter. - Appends to the WARC file if it already exists. - """ - use_gzip = str(warc_path).endswith('.gz') - with open(warc_path, 'ab') as fh: - warc_writer = FileWriter(fh, warc_path, gzip=use_gzip) - with capture_http(warc_writer): - for url in urls: - requests.get(url, **request_kwargs) - def validate_warc_headers(headers_path: Path, error, warn, success) -> None: """ Validate a headers.warc file created by capture(). 
diff --git a/tests/backends/__init__.py b/tests/backends/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_capture.py b/tests/backends/test_path_backend.py similarity index 90% rename from tests/test_capture.py rename to tests/backends/test_path_backend.py index ba28734..2ecd01d 100644 --- a/tests/test_capture.py +++ b/tests/backends/test_path_backend.py @@ -1,7 +1,6 @@ import pytest -from pathlib import Path from warcio.archiveiterator import ArchiveIterator -from nabit.lib.capture import capture +from nabit.lib.backends.url import UrlCollectionTask @pytest.fixture @@ -19,7 +18,7 @@ def capture_dir(tmp_path): def test_capture_with_content(capture_dir, server): """Test capturing a 200 response with body content""" - capture([server.url_for("/test.txt")], capture_dir["headers_path"]) + UrlCollectionTask(url=server.url_for("/test.txt")).collect(capture_dir["files_dir"]) # Check headers.warc with open(capture_dir["headers_path"], 'rb') as fh: @@ -41,7 +40,7 @@ def test_capture_empty_response(capture_dir, server): # Add empty response to server server.expect_request("/empty").respond_with_data("") - capture([server.url_for("/empty")], capture_dir["headers_path"]) + UrlCollectionTask(url=server.url_for("/empty")).collect(capture_dir["headers_path"]) # Check headers.warc - should be a response record, not revisit with open(capture_dir["headers_path"], 'rb') as fh: @@ -65,7 +64,7 @@ def test_capture_redirect(capture_dir, server): headers={"Location": target_url} ) - capture([redirect_url], capture_dir["headers_path"]) + UrlCollectionTask(url=redirect_url).collect(capture_dir["headers_path"]) # Check headers.warc with open(capture_dir["headers_path"], 'rb') as fh: diff --git a/tests/backends/test_url_backend.py b/tests/backends/test_url_backend.py new file mode 100644 index 0000000..c1e0cc8 --- /dev/null +++ b/tests/backends/test_url_backend.py @@ -0,0 +1,21 @@ +from nabit.lib.backends.path import PathCollectionTask + + +def test_ds_store_ignored(tmp_path): + """Test that files in ignore_patterns are ignored when copying directories""" + # Setup source directory + source_dir = tmp_path / "test_dir" + source_dir.mkdir() + (source_dir / ".DS_Store").write_text("ignored") + (source_dir / "test.txt").write_text("included") + + # Setup destination directory + dest_dir = tmp_path / "output" + dest_dir.mkdir() + + # Test copying + PathCollectionTask(path=str(source_dir)).collect(dest_dir) + + # Verify results + assert not (dest_dir / "test_dir/.DS_Store").exists() + assert (dest_dir / "test_dir/test.txt").read_text() == "included" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 9e173ae..500c5a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,8 @@ from nabit.lib.archive import package from nabit.lib.sign import KNOWN_TSAS - +from nabit.lib.backends.path import PathCollectionTask +from nabit.lib.backends.url import UrlCollectionTask @pytest.fixture def test_files(tmp_path): @@ -26,7 +27,10 @@ def test_bag(tmp_path, test_files): bag_path = tmp_path / "test_bag" package( output_path=bag_path, - paths=test_files["payload"], + collect=[ + PathCollectionTask(path=str(test_files["payload"][0])), + PathCollectionTask(path=str(test_files["payload"][1])) + ], signed_metadata=test_files["signed_metadata"].read_text(), unsigned_metadata=test_files["unsigned_metadata"].read_text(), bag_info={"Source-Organization": "Test Org"} @@ -40,7 +44,9 @@ def warc_bag(tmp_path, server): bag_path = tmp_path / "warc_bag" package( 
output_path=bag_path, - urls=[server.url_for("/")], + collect=[ + UrlCollectionTask(url=server.url_for("/")) + ], bag_info={"Source-Organization": "Test Org"} ) return bag_path @@ -59,7 +65,10 @@ def signed_bag(tmp_path, test_files, root_ca): # TODO: don't call out to live TSA server package( output_path=bag_path, - paths=test_files["payload"], + collect=[ + PathCollectionTask(path=str(test_files["payload"][0])), + PathCollectionTask(path=str(test_files["payload"][1])) + ], signed_metadata=test_files["signed_metadata"].read_text(), unsigned_metadata=test_files["unsigned_metadata"].read_text(), bag_info={"Source-Organization": "Test Org"}, diff --git a/tests/test_archive.py b/tests/test_archive.py index baaaf26..f74e158 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -1,26 +1,7 @@ import pytest -from nabit.lib.archive import copy_paths, validate_package +from nabit.lib.archive import validate_package -def test_ds_store_ignored(tmp_path): - """Test that files in IGNORE_PATTERNS are ignored when copying directories""" - # Setup source directory - source_dir = tmp_path / "test_dir" - source_dir.mkdir() - (source_dir / ".DS_Store").write_text("ignored") - (source_dir / "test.txt").write_text("included") - - # Setup destination directory - dest_dir = tmp_path / "output" - dest_dir.mkdir() - - # Test copying - copy_paths([source_dir], dest_dir) - - # Verify results - assert not (dest_dir / "test_dir/.DS_Store").exists() - assert (dest_dir / "test_dir/test.txt").read_text() == "included" - def test_validate_raises(tmp_path): # make sure that vanilla validate_package raises an error # unless there's an error callback that does something else diff --git a/tests/test_cli.py b/tests/test_cli.py index 949ed76..51a6fe6 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -306,6 +306,45 @@ def test_duplicate_file_names(runner, tmp_path, server): files = sorted((p.name for p in (bag_path / "data" / "files").glob("data*.html"))) assert re.match(r"data-[0-9a-zA-Z]{6}\.html;data-[0-9a-zA-Z]{6}\.html;data\.html", ";".join(files)) +def test_url_with_custom_output_path(runner, tmp_path, server): + """Test archiving a URL with a custom output path""" + bag_path = tmp_path / 'bag' + custom_output_path = 'custom_output.html' + + run(runner, [ + 'archive', + str(bag_path), + '-u', f'{{"url": "{server.url_for("/")}", "output": "{custom_output_path}"}}', + '-u', server.url_for("/another.html"), + ]) + + # Verify that the file is saved with the custom output path + assert (bag_path / f'data/files/{custom_output_path}').read_text() == 'root content' + assert (bag_path / f'data/files/another.html').read_text() == 'another content' + assert validate_passing(bag_path) == snapshot("""\ +SUCCESS: headers.warc found +SUCCESS: bag format is valid +WARNING: No signatures found +WARNING: No timestamps found\ +""") + +def test_collect_json(runner, tmp_path, server): + """Test successful parsing of --collect JSON""" + bag_path = tmp_path / 'bag' + collect_tasks = [ + {"backend": "url", "url": server.url_for("/")}, + {"backend": "url", "url": server.url_for("/another.html"), "output": "custom.html"} + ] + + run(runner, [ + 'archive', + str(bag_path), + '--collect', json.dumps(collect_tasks) + ]) + + assert (bag_path / 'data/files/data.html').read_text() == 'root content' + assert (bag_path / 'data/files/custom.html').read_text() == 'another content' + ## validation errors def test_invalid_metadata_file_extension(runner, tmp_path): @@ -398,3 +437,26 @@ def test_empty_package(runner, tmp_path): 'archive', 
str(tmp_path), ], exit_code=1, output='No files in data/files') + +def test_invalid_collect_json(runner, tmp_path): + """Test error handling for invalid --collect JSON""" + # Test invalid JSON syntax + run(runner, [ + 'archive', + str(tmp_path / 'bag'), + '--collect', '{invalid json}' + ], exit_code=2, output='Invalid JSON string for --collect') + + # Test non-list JSON + run(runner, [ + 'archive', + str(tmp_path / 'bag'), + '--collect', '{"not": "a list"}' + ], exit_code=2, output='--collect must be a list of JSON objects') + + # Test invalid task definition + run(runner, [ + 'archive', + str(tmp_path / 'bag'), + '--collect', '[{"backend": "invalid"}]' + ], exit_code=2, output='Invalid task definition for --collect')
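The README's note about future backends (FTP, crawlers, etc.) maps directly onto this structure: a backend is a dataclass with a `collect(files_dir)` method, looked up by name in `get_backends()`. A purely hypothetical sketch of what an `ftp` backend could look like under those assumptions — it would also need an `'ftp'` entry in the dict returned by `nabit.lib.backends.base.get_backends()` before `--collect` could reach it:

```python
from dataclasses import dataclass
from ftplib import FTP
from pathlib import Path
from urllib.parse import urlparse

from nabit.lib.backends.base import CollectionTask
from nabit.lib.utils import get_unique_path


@dataclass
class FtpCollectionTask(CollectionTask):
    """Collect a single file over FTP (illustrative only, not part of the library)."""
    url: str                    # e.g. "ftp://ftp.example.com/pub/file.txt"
    output: Path | None = None  # optional file name under data/files/

    def collect(self, files_dir: Path) -> None:
        parsed = urlparse(self.url)
        # fall back to the last path component, like the url backend does
        name = self.output or Path(parsed.path).name or "data"
        dest_path = get_unique_path(files_dir / name)
        with FTP(parsed.hostname) as ftp:
            ftp.login()  # anonymous login
            with open(dest_path, "wb") as out:
                ftp.retrbinary(f"RETR {parsed.path}", out.write)
```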