Skip to content

Commit

Permalink
Sanitize content hashes in archive methods
Browse files Browse the repository at this point in the history
Our content hashes are always hex digests and we should reject anything that contains non-hex characters.
  • Loading branch information
tillprochaska committed May 28, 2024
1 parent fc56a38 commit e37a8dd
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 12 deletions.
6 changes: 5 additions & 1 deletion servicelayer/archive/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from normality import safe_filename

from servicelayer.archive.archive import Archive
from servicelayer.archive.util import ensure_path, checksum, BUF_SIZE
from servicelayer.archive.util import ensure_path, checksum, sanitize_checksum, BUF_SIZE
from servicelayer.archive.util import path_prefix, path_content_hash

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -33,6 +33,8 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
"""Import the given file into the archive."""
if content_hash is None:
content_hash = checksum(file_path)
else:
content_hash = sanitize_checksum(content_hash)

if content_hash is None:
return
Expand All @@ -51,6 +53,7 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
return content_hash

def load_file(self, content_hash, file_name=None, temp_path=None):
content_hash = sanitize_checksum(content_hash)
return self._locate_key(content_hash)

def list_files(self, prefix=None):
Expand All @@ -67,6 +70,7 @@ def list_files(self, prefix=None):
yield path_content_hash(file_path)

def delete_file(self, content_hash):
content_hash = sanitize_checksum(content_hash)
prefix = path_prefix(content_hash)
if prefix is None:
return
Expand Down
6 changes: 5 additions & 1 deletion servicelayer/archive/gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from google.resumable_media.common import DataCorruption, InvalidResponse

from servicelayer.archive.virtual import VirtualArchive
from servicelayer.archive.util import checksum, ensure_path
from servicelayer.archive.util import checksum, sanitize_checksum, ensure_path
from servicelayer.archive.util import path_prefix, ensure_posix_path
from servicelayer.archive.util import path_content_hash, HASH_LENGTH
from servicelayer.util import service_retries, backoff
Expand Down Expand Up @@ -89,6 +89,8 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
file_path = ensure_path(file_path)
if content_hash is None:
content_hash = checksum(file_path)
else:
content_hash = sanitize_checksum(content_hash)

if content_hash is None:
return
Expand All @@ -111,6 +113,7 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
def load_file(self, content_hash, file_name=None, temp_path=None):
"""Retrieve a file from Google storage and put it onto the local file
system for further processing."""
content_hash = sanitize_checksum(content_hash)
for attempt in service_retries():
try:
blob = self._locate_contenthash(content_hash)
Expand Down Expand Up @@ -147,6 +150,7 @@ def delete_file(self, content_hash):
"""Check if a file with the given hash exists on S3."""
if content_hash is None or len(content_hash) < HASH_LENGTH:
return
content_hash = sanitize_checksum(content_hash)
prefix = path_prefix(content_hash)
if prefix is None:
return
Expand Down
6 changes: 5 additions & 1 deletion servicelayer/archive/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from servicelayer import settings
from servicelayer.archive.virtual import VirtualArchive
from servicelayer.archive.util import checksum, ensure_path
from servicelayer.archive.util import checksum, sanitize_checksum, ensure_path
from servicelayer.archive.util import path_prefix, path_content_hash

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -86,6 +86,8 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
file_path = ensure_path(file_path)
if content_hash is None:
content_hash = checksum(file_path)
else:
content_hash = sanitize_checksum(content_hash)

# if content_hash is None:
# return
Expand All @@ -105,6 +107,7 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
def load_file(self, content_hash, file_name=None, temp_path=None):
"""Retrieve a file from S3 storage and put it onto the local file
system for further processing."""
content_hash = sanitize_checksum(content_hash)
key = self._locate_key(content_hash)
if key is not None:
path = self._local_path(content_hash, file_name, temp_path)
Expand All @@ -114,6 +117,7 @@ def load_file(self, content_hash, file_name=None, temp_path=None):
def delete_file(self, content_hash):
if content_hash is None:
return
content_hash = sanitize_checksum(content_hash)
prefix = path_prefix(content_hash)
if prefix is None:
return
Expand Down
13 changes: 13 additions & 0 deletions servicelayer/archive/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import string
from hashlib import sha1
from pathlib import Path

Expand Down Expand Up @@ -32,6 +33,18 @@ def checksum(file_name):
return str(digest.hexdigest())


def sanitize_checksum(checksum):
"""Normalize the checksum. Raises an error if the given checksum invalid."""
if not checksum:
raise ValueError("Checksum is empty")

for char in checksum:
if char not in string.hexdigits:
raise ValueError(f'Checksum contains invalid character "{char}"')

return checksum


def path_prefix(content_hash):
"""Get a prefix for a content hashed folder structure."""
if content_hash is None:
Expand Down
3 changes: 2 additions & 1 deletion servicelayer/archive/virtual.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from normality import safe_filename

from servicelayer.archive.archive import Archive
from servicelayer.archive.util import ensure_path
from servicelayer.archive.util import ensure_path, sanitize_checksum

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -33,6 +33,7 @@ def cleanup_file(self, content_hash, temp_path=None):
"""Delete the local cached version of the file."""
if content_hash is None:
return
content_hash = sanitize_checksum(content_hash)
path = self._get_local_prefix(content_hash, temp_path=temp_path)
try:
shutil.rmtree(path, ignore_errors=True)
Expand Down
29 changes: 24 additions & 5 deletions tests/archive/test_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
import shutil
import tempfile
from unittest import TestCase
Expand Down Expand Up @@ -26,9 +27,11 @@ def test_basic_archive(self):
assert out == out2, (out, out2)

def test_basic_archive_with_checksum(self):
checksum_ = "banana"
out = self.archive.archive_file(self.file, checksum_)
assert checksum_ == out, (checksum_, out)
with pytest.raises(ValueError):
self.archive.archive_file(self.file, content_hash="banana")

out = self.archive.archive_file(self.file, content_hash="01234567890abcdef")
assert out == "01234567890abcdef"

def test_generate_url(self):
out = self.archive.archive_file(self.file)
Expand All @@ -39,6 +42,15 @@ def test_publish(self):
assert not self.archive.can_publish

def test_load_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.load_file("banana")

# Valid content hash, but file does not exist
path = self.archive.load_file("01234567890abcdef")
assert path is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
path = self.archive.load_file(out)
assert path is not None, path
Expand All @@ -64,10 +76,17 @@ def test_list_files(self):
assert len(keys) == 0, keys

def test_delete_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.delete_file("banana")

# File does not exist
assert self.archive.delete_file("01234567890abcdef") is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
path = self.archive.load_file(out)
assert path is not None, path
self.archive.cleanup_file(out)
self.archive.delete_file(out)
assert self.archive.delete_file(out) is None
path = self.archive.load_file(out)
assert path is None, path
34 changes: 31 additions & 3 deletions tests/archive/test_s3.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
from unittest import TestCase
from urllib.parse import urlparse, parse_qs

Expand Down Expand Up @@ -26,9 +27,11 @@ def test_basic_archive(self):
assert out == out2, (out, out2)

def test_basic_archive_with_checksum(self):
checksum_ = "banana"
out = self.archive.archive_file(self.file, checksum_)
assert checksum_ == out, (checksum_, out)
with pytest.raises(ValueError):
self.archive.archive_file(self.file, content_hash="banana")

out = self.archive.archive_file(self.file, content_hash="01234567890abcdef")
assert out == "01234567890abcdef"

def test_generate_url(self):
content_hash = self.archive.archive_file(self.file)
Expand Down Expand Up @@ -60,12 +63,29 @@ def test_publish_file(self):
assert "https://foo.s3.amazonaws.com/self.py" in url, url

def test_load_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.load_file("banana")

# Valid content hash, but file does not exist
path = self.archive.load_file("01234567890abcdef")
assert path is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
path = self.archive.load_file(out)
assert path is not None, path
assert path.is_file(), path

def test_cleanup_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.cleanup_file("banana")

# File does not exist
assert self.archive.cleanup_file("01234567890abcdef") is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
self.archive.cleanup_file(out)
path = self.archive.load_file(out)
Expand All @@ -86,6 +106,14 @@ def test_list_files(self):
assert len(keys) == 0, keys

def test_delete_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.delete_file("banana")

# File does not exist
assert self.archive.delete_file("01234567890abcdef") is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
path = self.archive.load_file(out)
assert path is not None, path
Expand Down
21 changes: 21 additions & 0 deletions tests/archive/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pytest
from unittest import TestCase

from servicelayer.archive.util import sanitize_checksum


class UtilTest(TestCase):
def test_sanitize_checksum(self):
assert sanitize_checksum("0123456789abcdef") == "0123456789abcdef"

with pytest.raises(ValueError, match="Checksum is empty"):
sanitize_checksum(None)

with pytest.raises(ValueError, match="Checksum is empty"):
sanitize_checksum("")

with pytest.raises(ValueError, match='Checksum contains invalid character "n"'):
sanitize_checksum("banana")

with pytest.raises(ValueError, match='Checksum contains invalid character "/"'):
sanitize_checksum("/")

0 comments on commit e37a8dd

Please sign in to comment.