From 16e1bd0b4662ab4e5f891319332a932e15b73dee Mon Sep 17 00:00:00 2001 From: Jack Cushman Date: Mon, 9 Dec 2024 15:09:44 -0500 Subject: [PATCH] Improved output filename handling --- src/nabit/lib/backends/url.py | 84 ++++++++++++++++++++++++++++------- tests/test_cli.py | 22 +++++---- tests/test_validation.py | 6 +-- 3 files changed, 81 insertions(+), 31 deletions(-) diff --git a/src/nabit/lib/backends/url.py b/src/nabit/lib/backends/url.py index df8dfc4..f6d00e8 100644 --- a/src/nabit/lib/backends/url.py +++ b/src/nabit/lib/backends/url.py @@ -6,6 +6,7 @@ from pathlib import Path import requests import os +import re from dataclasses import dataclass from ..utils import get_unique_path from .base import CollectionTask @@ -42,6 +43,10 @@ class UrlCollectionTask(CollectionTask): timeout: float = 5.0 + # if content type matches this string, use the given extension + # instead of the extension in the URL path + content_type_overrides = {"text/html": ".html"} + def __post_init__(self): """Validate the URL by attempting to prepare a request.""" requests.Request('GET', self.url).prepare() @@ -76,12 +81,13 @@ class FileWriter(WARCWriter): custom_out_path = None # override output path result_path = None - def __init__(self, filebuf, warc_path: Path, *args, **kwargs): + def __init__(self, filebuf, warc_path: Path, content_type_overrides: dict[str, str] = {}, *args, **kwargs): super(WARCWriter, self).__init__(*args, **kwargs) self.out = filebuf self.warc_path = Path(warc_path) self.files_path = self.warc_path.parent / 'files' self.files_path.mkdir(exist_ok=True) + self.content_type_overrides = content_type_overrides def _write_warc_record(self, out, record): if record.rec_type == 'response' and record.http_headers and record.http_headers.get_statuscode() in self.revisit_status_codes: @@ -92,22 +98,14 @@ def _write_warc_record(self, out, record): ## get a filename for the response body if self.custom_out_path is not None: - out_path = self.custom_out_path + out_name = self.custom_out_path else: - uri = headers.get_header('WARC-Target-URI') - parsed_url = urlparse(uri) - filename = Path(parsed_url.path.split('/')[-1]) - # set stem - stem = filename.stem.lstrip('.') or 'data' - # set extension - extension = filename.suffix - if not extension: - if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch - extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False) - if not extension: - extension = '.unknown' # pragma: no cover - out_path = f'{stem}{extension}' - out_path = get_unique_path(self.files_path / out_path) + out_name = url_to_filename( + headers.get_header('WARC-Target-URI'), + record.http_headers.get_header('Content-Type'), + self.content_type_overrides + ) + out_path = get_unique_path(self.files_path / out_name) relative_path = out_path.relative_to(self.warc_path.parent) self.result_path = out_path.relative_to(self.files_path) @@ -176,3 +174,57 @@ def validate_warc_headers(headers_path: Path, error, warn, success) -> None: if file not in headers_files: warn(f"Some files in data/files are not specified in headers.warc. Example: {file}") break + +def url_to_filename(url: str, content_type: str | None = None, content_type_overrides: dict[str, str] = {}) -> str: + """ + Convert a URL to a filename based on the URL path and content type. + + >>> url_to_filename('https://example.com/path/to/file.pdf') + 'file.pdf' + >>> url_to_filename('https://sub.example.com/', 'text/html') + 'sub_example_com.html' + >>> url_to_filename('https://example.com/foo', 'fake/content-type') + 'foo.unknown' + >>> url_to_filename('https://example.com/page', 'text/html') + 'page.html' + >>> url_to_filename('https://example.com/image', 'image/jpeg') + 'image.jpg' + >>> url_to_filename('https://example.com/path/', 'text/plain') + 'path.txt' + >>> url_to_filename('https://example.com/page.php', 'text/html') + 'page.php' + >>> url_to_filename('https://example.com/page.php', 'text/html; charset=utf-8', {'text/html': '.html'}) + 'page.html' + >>> url_to_filename('https://example.com/page.html?foo=bar') + 'page.html' + >>> url_to_filename('https://127.0.0.1:8080/', 'text/html') + '127_0_0_1.html' + >>> url_to_filename('https://example.com/..', 'text/html') + 'dot_.html' + >>> url_to_filename('https://example.com/.html', 'text/html') + 'dot_html.html' + """ + parsed_url = urlparse(url) + if parsed_url.path.strip('/'): + filename = parsed_url.path.rstrip('/').split('/')[-1] + else: + filename = parsed_url.hostname.replace('.', '_') + filename = Path(filename) + + # set stem + stem = filename.stem + stem = re.sub(r'^\.+', 'dot_', stem) + + # set extension + extension = None + content_type = (content_type or '').split(';')[0] # strip content types like "text/html; charset=utf-8" + if content_type in content_type_overrides: + extension = content_type_overrides[content_type] + elif filename.suffix: + extension = filename.suffix + elif content_type: + extension = mimetypes.guess_extension(content_type, strict=False) + if not extension: + extension = '.unknown' + + return f'{stem}{extension}' \ No newline at end of file diff --git a/tests/test_cli.py b/tests/test_cli.py index 37ddb05..e999cd3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -12,7 +12,7 @@ def run(runner, args, exit_code=0, output="Package created"): result = runner.invoke(main, args, catch_exceptions=False) - assert result.exit_code == exit_code + assert result.exit_code == exit_code, f"Expected exit code {exit_code}, got {result.exit_code} with output: {result.output}" if output: assert output in result.output return result @@ -77,7 +77,7 @@ def test_url_payload(runner, tmp_path, server): WARNING: No signatures found WARNING: No timestamps found\ """) - assert (bag_path / 'data/files/data.html').read_text() == 'root content' + assert (bag_path / 'data/files/localhost.html').read_text() == 'root content' assert (bag_path / 'data/files/another.html').read_text() == 'another content' assert (bag_path / 'data/files/test.txt').read_text() == 'test content' @@ -287,19 +287,17 @@ def test_duplicate_file_names(runner, tmp_path, server): bag_path = tmp_path / 'bag' # Create two files with the same name in different directories - dir1 = tmp_path / "dir1" - dir2 = tmp_path / "dir2" - dir1.mkdir() - dir2.mkdir() - (dir1 / "data.html").write_text("content1") - (dir2 / "data.html").write_text("content2") + (tmp_path / "data.html").write_text("content1") + (tmp_path / "data2.html").write_text("content2") + server.expect_request("/data.html").respond_with_data("content3", content_type="text/html") + run(runner, [ 'archive', str(bag_path), - '-p', str(dir1 / "data.html"), - '-p', str(dir2 / "data.html"), - '-u', server.url_for("/"), + '-p', str(tmp_path / "data.html"), + '-p', f'{{"path": "{tmp_path / "data2.html"}", "output": "data.html"}}', + '-u', server.url_for("/data.html"), ]) # Verify files exist; one will be data.html and the others will be data-a1b2c3.html @@ -342,7 +340,7 @@ def test_collect_json(runner, tmp_path, server): '--collect', json.dumps(collect_tasks) ]) - assert (bag_path / 'data/files/data.html').read_text() == 'root content' + assert (bag_path / 'data/files/localhost.html').read_text() == 'root content' assert (bag_path / 'data/files/custom.html').read_text() == 'another content' def test_empty_package(runner, tmp_path): diff --git a/tests/test_validation.py b/tests/test_validation.py index 8ef31c1..546ab7b 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -51,12 +51,12 @@ def test_extra_payload(test_bag): """) def test_missing_warc_file(warc_bag): - (warc_bag / "data/files/data.html").unlink() + (warc_bag / "data/files/localhost.html").unlink() assert validate_failing(warc_bag) == snapshot("""\ SUCCESS: headers.warc found -ERROR: headers.warc specifies files that do not exist in data/files. Example: files/data.html +ERROR: headers.warc specifies files that do not exist in data/files. Example: files/localhost.html WARNING: No files in data/files -ERROR: bag format is invalid: Bag validation failed: data/files/data.html exists in manifest but was not found on filesystem +ERROR: bag format is invalid: Bag validation failed: data/files/localhost.html exists in manifest but was not found on filesystem WARNING: Cannot verify the validity of empty directories: /data/files WARNING: No signatures found WARNING: No timestamps found\