Skip to content

Commit

Permalink
Improved output filename handling
Browse files Browse the repository at this point in the history
  • Loading branch information
jcushman committed Dec 9, 2024
1 parent a8e5be1 commit 16e1bd0
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 31 deletions.
84 changes: 68 additions & 16 deletions src/nabit/lib/backends/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path
import requests
import os
import re
from dataclasses import dataclass
from ..utils import get_unique_path
from .base import CollectionTask
Expand Down Expand Up @@ -42,6 +43,10 @@ class UrlCollectionTask(CollectionTask):

timeout: float = 5.0

# if content type matches this string, use the given extension
# instead of the extension in the URL path
content_type_overrides = {"text/html": ".html"}

def __post_init__(self):
"""Validate the URL by attempting to prepare a request."""
requests.Request('GET', self.url).prepare()
Expand Down Expand Up @@ -76,12 +81,13 @@ class FileWriter(WARCWriter):
custom_out_path = None # override output path
result_path = None

def __init__(self, filebuf, warc_path: Path, *args, **kwargs):
def __init__(self, filebuf, warc_path: Path, content_type_overrides: dict[str, str] = {}, *args, **kwargs):
super(WARCWriter, self).__init__(*args, **kwargs)
self.out = filebuf
self.warc_path = Path(warc_path)
self.files_path = self.warc_path.parent / 'files'
self.files_path.mkdir(exist_ok=True)
self.content_type_overrides = content_type_overrides

def _write_warc_record(self, out, record):
if record.rec_type == 'response' and record.http_headers and record.http_headers.get_statuscode() in self.revisit_status_codes:
Expand All @@ -92,22 +98,14 @@ def _write_warc_record(self, out, record):

## get a filename for the response body
if self.custom_out_path is not None:
out_path = self.custom_out_path
out_name = self.custom_out_path
else:
uri = headers.get_header('WARC-Target-URI')
parsed_url = urlparse(uri)
filename = Path(parsed_url.path.split('/')[-1])
# set stem
stem = filename.stem.lstrip('.') or 'data'
# set extension
extension = filename.suffix
if not extension:
if content_type := record.http_headers.get_header('Content-Type'): # pragma: no branch
extension = mimetypes.guess_extension(content_type.split(';')[0], strict=False)
if not extension:
extension = '.unknown' # pragma: no cover
out_path = f'{stem}{extension}'
out_path = get_unique_path(self.files_path / out_path)
out_name = url_to_filename(
headers.get_header('WARC-Target-URI'),
record.http_headers.get_header('Content-Type'),
self.content_type_overrides
)
out_path = get_unique_path(self.files_path / out_name)
relative_path = out_path.relative_to(self.warc_path.parent)
self.result_path = out_path.relative_to(self.files_path)

Expand Down Expand Up @@ -176,3 +174,57 @@ def validate_warc_headers(headers_path: Path, error, warn, success) -> None:
if file not in headers_files:
warn(f"Some files in data/files are not specified in headers.warc. Example: {file}")
break

def url_to_filename(url: str, content_type: str | None = None, content_type_overrides: dict[str, str] = {}) -> str:
"""
Convert a URL to a filename based on the URL path and content type.
>>> url_to_filename('https://example.com/path/to/file.pdf')
'file.pdf'
>>> url_to_filename('https://sub.example.com/', 'text/html')
'sub_example_com.html'
>>> url_to_filename('https://example.com/foo', 'fake/content-type')
'foo.unknown'
>>> url_to_filename('https://example.com/page', 'text/html')
'page.html'
>>> url_to_filename('https://example.com/image', 'image/jpeg')
'image.jpg'
>>> url_to_filename('https://example.com/path/', 'text/plain')
'path.txt'
>>> url_to_filename('https://example.com/page.php', 'text/html')
'page.php'
>>> url_to_filename('https://example.com/page.php', 'text/html; charset=utf-8', {'text/html': '.html'})
'page.html'
>>> url_to_filename('https://example.com/page.html?foo=bar')
'page.html'
>>> url_to_filename('https://127.0.0.1:8080/', 'text/html')
'127_0_0_1.html'
>>> url_to_filename('https://example.com/..', 'text/html')
'dot_.html'
>>> url_to_filename('https://example.com/.html', 'text/html')
'dot_html.html'
"""
parsed_url = urlparse(url)
if parsed_url.path.strip('/'):
filename = parsed_url.path.rstrip('/').split('/')[-1]
else:
filename = parsed_url.hostname.replace('.', '_')
filename = Path(filename)

# set stem
stem = filename.stem
stem = re.sub(r'^\.+', 'dot_', stem)

# set extension
extension = None
content_type = (content_type or '').split(';')[0] # strip content types like "text/html; charset=utf-8"
if content_type in content_type_overrides:
extension = content_type_overrides[content_type]
elif filename.suffix:
extension = filename.suffix
elif content_type:
extension = mimetypes.guess_extension(content_type, strict=False)
if not extension:
extension = '.unknown'

return f'{stem}{extension}'
22 changes: 10 additions & 12 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

def run(runner, args, exit_code=0, output="Package created"):
result = runner.invoke(main, args, catch_exceptions=False)
assert result.exit_code == exit_code
assert result.exit_code == exit_code, f"Expected exit code {exit_code}, got {result.exit_code} with output: {result.output}"
if output:
assert output in result.output
return result
Expand Down Expand Up @@ -77,7 +77,7 @@ def test_url_payload(runner, tmp_path, server):
WARNING: No signatures found
WARNING: No timestamps found\
""")
assert (bag_path / 'data/files/data.html').read_text() == 'root content'
assert (bag_path / 'data/files/localhost.html').read_text() == 'root content'
assert (bag_path / 'data/files/another.html').read_text() == 'another content'
assert (bag_path / 'data/files/test.txt').read_text() == 'test content'

Expand Down Expand Up @@ -287,19 +287,17 @@ def test_duplicate_file_names(runner, tmp_path, server):
bag_path = tmp_path / 'bag'

# Create two files with the same name in different directories
dir1 = tmp_path / "dir1"
dir2 = tmp_path / "dir2"
dir1.mkdir()
dir2.mkdir()
(dir1 / "data.html").write_text("content1")
(dir2 / "data.html").write_text("content2")
(tmp_path / "data.html").write_text("content1")
(tmp_path / "data2.html").write_text("content2")
server.expect_request("/data.html").respond_with_data("content3", content_type="text/html")


run(runner, [
'archive',
str(bag_path),
'-p', str(dir1 / "data.html"),
'-p', str(dir2 / "data.html"),
'-u', server.url_for("/"),
'-p', str(tmp_path / "data.html"),
'-p', f'{{"path": "{tmp_path / "data2.html"}", "output": "data.html"}}',
'-u', server.url_for("/data.html"),
])

# Verify files exist; one will be data.html and the others will be data-a1b2c3.html
Expand Down Expand Up @@ -342,7 +340,7 @@ def test_collect_json(runner, tmp_path, server):
'--collect', json.dumps(collect_tasks)
])

assert (bag_path / 'data/files/data.html').read_text() == 'root content'
assert (bag_path / 'data/files/localhost.html').read_text() == 'root content'
assert (bag_path / 'data/files/custom.html').read_text() == 'another content'

def test_empty_package(runner, tmp_path):
Expand Down
6 changes: 3 additions & 3 deletions tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,12 @@ def test_extra_payload(test_bag):
""")

def test_missing_warc_file(warc_bag):
(warc_bag / "data/files/data.html").unlink()
(warc_bag / "data/files/localhost.html").unlink()
assert validate_failing(warc_bag) == snapshot("""\
SUCCESS: headers.warc found
ERROR: headers.warc specifies files that do not exist in data/files. Example: files/data.html
ERROR: headers.warc specifies files that do not exist in data/files. Example: files/localhost.html
WARNING: No files in data/files
ERROR: bag format is invalid: Bag validation failed: data/files/data.html exists in manifest but was not found on filesystem
ERROR: bag format is invalid: Bag validation failed: data/files/localhost.html exists in manifest but was not found on filesystem
WARNING: Cannot verify the validity of empty directories: <bag_path>/data/files
WARNING: No signatures found
WARNING: No timestamps found\
Expand Down

0 comments on commit 16e1bd0

Please sign in to comment.