Improved output filename handling

harvard-lil · Dec 9, 2024 · 16e1bd0 · 16e1bd0
1 parent a8e5be1
commit 16e1bd0
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 31 deletions.
diff --git a/src/nabit/lib/backends/url.py b/src/nabit/lib/backends/url.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 import requests
 import os
+import re
 from dataclasses import dataclass
 from ..utils import get_unique_path
 from .base import CollectionTask
@@ -42,6 +43,10 @@ class UrlCollectionTask(CollectionTask):
 
     timeout: float = 5.0
 
+    # if content type matches this string, use the given extension
+    # instead of the extension in the URL path
+    content_type_overrides = {"text/html": ".html"}
+
     def __post_init__(self):
         """Validate the URL by attempting to prepare a request."""
         requests.Request('GET', self.url).prepare()
@@ -76,12 +81,13 @@ class FileWriter(WARCWriter):
     custom_out_path = None  # override output path
     result_path = None
 
-    def __init__(self, filebuf, warc_path: Path, *args, **kwargs):
+    def __init__(self, filebuf, warc_path: Path, content_type_overrides: dict[str, str] = {}, *args, **kwargs):
         super(WARCWriter, self).__init__(*args, **kwargs)
         self.out = filebuf
         self.warc_path = Path(warc_path)
         self.files_path = self.warc_path.parent / 'files'
         self.files_path.mkdir(exist_ok=True)
+        self.content_type_overrides = content_type_overrides
 
     def _write_warc_record(self, out, record):
         if record.rec_type == 'response' and record.http_headers and record.http_headers.get_statuscode() in self.revisit_status_codes:
@@ -92,22 +98,14 @@ def _write_warc_record(self, out, record):
 
             ## get a filename for the response body
             if self.custom_out_path is not None:
-                out_path = self.custom_out_path
+                out_name = self.custom_out_path
             else:
-                uri = headers.get_header('WARC-Target-URI')
-                parsed_url = urlparse(uri)
-                filename = Path(parsed_url.path.split('/')[-1])
-                # set stem
-                stem = filename.stem.lstrip('.') or 'data'
-                # set extension
-                extension = filename.suffix
-                if not extension:
-                    if content_type := record.http_headers.get_header('Content-Type'):  # pragma: no branch
-                        extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False)
-                    if not extension:  
-                        extension = '.unknown'  # pragma: no cover
-                out_path = f'{stem}{extension}'
-            out_path = get_unique_path(self.files_path / out_path)
+                out_name = url_to_filename(
+                    headers.get_header('WARC-Target-URI'),
+                    record.http_headers.get_header('Content-Type'),
+                    self.content_type_overrides
+                )
+            out_path = get_unique_path(self.files_path / out_name)
             relative_path = out_path.relative_to(self.warc_path.parent)
             self.result_path = out_path.relative_to(self.files_path)
 
@@ -176,3 +174,57 @@ def validate_warc_headers(headers_path: Path, error, warn, success) -> None:
             if file not in headers_files:
                 warn(f"Some files in data/files are not specified in headers.warc. Example: {file}")
                 break
+
+def url_to_filename(url: str, content_type: str | None = None, content_type_overrides: dict[str, str] = {}) -> str:
+    """
+    Convert a URL to a filename based on the URL path and content type.
+
+    >>> url_to_filename('https://example.com/path/to/file.pdf')
+    'file.pdf'
+    >>> url_to_filename('https://sub.example.com/', 'text/html')
+    'sub_example_com.html'
+    >>> url_to_filename('https://example.com/foo', 'fake/content-type')
+    'foo.unknown'
+    >>> url_to_filename('https://example.com/page', 'text/html')
+    'page.html'
+    >>> url_to_filename('https://example.com/image', 'image/jpeg')
+    'image.jpg'
+    >>> url_to_filename('https://example.com/path/', 'text/plain')
+    'path.txt'
+    >>> url_to_filename('https://example.com/page.php', 'text/html')
+    'page.php'
+    >>> url_to_filename('https://example.com/page.php', 'text/html; charset=utf-8', {'text/html': '.html'})
+    'page.html'
+    >>> url_to_filename('https://example.com/page.html?foo=bar')
+    'page.html'
+    >>> url_to_filename('https://127.0.0.1:8080/', 'text/html')
+    '127_0_0_1.html'
+    >>> url_to_filename('https://example.com/..', 'text/html')
+    'dot_.html'
+    >>> url_to_filename('https://example.com/.html', 'text/html')
+    'dot_html.html'
+    """
+    parsed_url = urlparse(url)
+    if parsed_url.path.strip('/'):
+        filename = parsed_url.path.rstrip('/').split('/')[-1]
+    else:
+        filename = parsed_url.hostname.replace('.', '_')
+    filename = Path(filename)
+
+    # set stem
+    stem = filename.stem
+    stem = re.sub(r'^\.+', 'dot_', stem)
+
+    # set extension
+    extension = None
+    content_type = (content_type or '').split(';')[0]  # strip content types like "text/html; charset=utf-8"
+    if content_type in content_type_overrides:
+        extension = content_type_overrides[content_type]
+    elif filename.suffix:
+        extension = filename.suffix
+    elif content_type:
+        extension = mimetypes.guess_extension(content_type, strict=False)
+    if not extension:
+        extension = '.unknown'
+
+    return f'{stem}{extension}'
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -12,7 +12,7 @@
 
 def run(runner, args, exit_code=0, output="Package created"):
     result = runner.invoke(main, args, catch_exceptions=False)
-    assert result.exit_code == exit_code
+    assert result.exit_code == exit_code, f"Expected exit code {exit_code}, got {result.exit_code} with output: {result.output}"
     if output:
         assert output in result.output
     return result
@@ -77,7 +77,7 @@ def test_url_payload(runner, tmp_path, server):
 WARNING: No signatures found
 WARNING: No timestamps found\
 """)
-    assert (bag_path / 'data/files/data.html').read_text() == 'root content' 
+    assert (bag_path / 'data/files/localhost.html').read_text() == 'root content' 
     assert (bag_path / 'data/files/another.html').read_text() == 'another content'
     assert (bag_path / 'data/files/test.txt').read_text() == 'test content'
 
@@ -287,19 +287,17 @@ def test_duplicate_file_names(runner, tmp_path, server):
     bag_path = tmp_path / 'bag'
 
     # Create two files with the same name in different directories
-    dir1 = tmp_path / "dir1"
-    dir2 = tmp_path / "dir2"
-    dir1.mkdir()
-    dir2.mkdir()
-    (dir1 / "data.html").write_text("content1")
-    (dir2 / "data.html").write_text("content2")
+    (tmp_path / "data.html").write_text("content1")
+    (tmp_path / "data2.html").write_text("content2")
+    server.expect_request("/data.html").respond_with_data("content3", content_type="text/html")
+
 
     run(runner, [
         'archive',
         str(bag_path),
-        '-p', str(dir1 / "data.html"),
-        '-p', str(dir2 / "data.html"),
-        '-u', server.url_for("/"),
+        '-p', str(tmp_path / "data.html"),
+        '-p', f'{{"path": "{tmp_path / "data2.html"}", "output": "data.html"}}',
+        '-u', server.url_for("/data.html"),
     ])
 
     # Verify files exist; one will be data.html and the others will be data-a1b2c3.html
@@ -342,7 +340,7 @@ def test_collect_json(runner, tmp_path, server):
         '--collect', json.dumps(collect_tasks)
     ])
 
-    assert (bag_path / 'data/files/data.html').read_text() == 'root content'
+    assert (bag_path / 'data/files/localhost.html').read_text() == 'root content'
     assert (bag_path / 'data/files/custom.html').read_text() == 'another content'
 
 def test_empty_package(runner, tmp_path):

diff --git a/tests/test_validation.py b/tests/test_validation.py
@@ -51,12 +51,12 @@ def test_extra_payload(test_bag):
 """)
 
 def test_missing_warc_file(warc_bag):
-    (warc_bag / "data/files/data.html").unlink()
+    (warc_bag / "data/files/localhost.html").unlink()
     assert validate_failing(warc_bag) == snapshot("""\
 SUCCESS: headers.warc found
-ERROR: headers.warc specifies files that do not exist in data/files. Example: files/data.html
+ERROR: headers.warc specifies files that do not exist in data/files. Example: files/localhost.html
 WARNING: No files in data/files
-ERROR: bag format is invalid: Bag validation failed: data/files/data.html exists in manifest but was not found on filesystem
+ERROR: bag format is invalid: Bag validation failed: data/files/localhost.html exists in manifest but was not found on filesystem
 WARNING: Cannot verify the validity of empty directories: <bag_path>/data/files
 WARNING: No signatures found
 WARNING: No timestamps found\