From d9d25cc366b5ae2e12651179ea47d83948750e73 Mon Sep 17 00:00:00 2001 From: Brian Helba Date: Wed, 13 Dec 2023 15:08:58 -0500 Subject: [PATCH] Don't use filesystem APIs to manipulate URLs Following #1743, this removes the remaining uses of the filesystem APIs `os.path` and `pathlib` (which are platform-dependent and contain methods for local filesystem manipulation) to process URLs. The `zarr_checksum` package still has this confusion in its `ZarrArchiveFile` class, but attempts to refactor some of that code failed due to other issues with it. So, the `ZarrArchiveFile.path` field is just cast to a `str` when interfacing with local application code. --- dandiapi/analytics/tasks/__init__.py | 5 ++--- dandiapi/api/tests/test_asset.py | 9 ++++----- dandiapi/api/tests/test_upload.py | 3 +-- dandiapi/api/views/asset.py | 4 +--- dandiapi/zarr/models.py | 5 ++--- dandiapi/zarr/tests/test_zarr.py | 6 +++--- dandiapi/zarr/views/__init__.py | 10 ++++------ 7 files changed, 17 insertions(+), 25 deletions(-) diff --git a/dandiapi/analytics/tasks/__init__.py b/dandiapi/analytics/tasks/__init__.py index 2d1f6d94a8..e68e35949f 100644 --- a/dandiapi/analytics/tasks/__init__.py +++ b/dandiapi/analytics/tasks/__init__.py @@ -1,6 +1,5 @@ from collections import Counter from collections.abc import Generator -from pathlib import Path from celery.app import shared_task from celery.utils.log import get_task_logger @@ -70,7 +69,7 @@ def process_s3_log_file_task(bucket: LogBucket, s3_log_key: str) -> None: # short circuit if the log file has already been processed. note that this doesn't guarantee # exactly once processing, that's what the unique constraint on ProcessedS3Log is for. - if ProcessedS3Log.objects.filter(name=Path(s3_log_key).name, embargoed=embargoed).exists(): + if ProcessedS3Log.objects.filter(name=s3_log_key.split('/')[-1], embargoed=embargoed).exists(): return s3 = get_boto_client(get_storage() if not embargoed else get_embargo_storage()) @@ -86,7 +85,7 @@ def process_s3_log_file_task(bucket: LogBucket, s3_log_key: str) -> None: with transaction.atomic(): try: - log = ProcessedS3Log(name=Path(s3_log_key).name, embargoed=embargoed) + log = ProcessedS3Log(name=s3_log_key.split('/')[-1], embargoed=embargoed) # disable constraint validation checking so duplicate errors can be detected and # ignored. the rest of the full_clean errors should still be raised. log.full_clean(validate_constraints=False) diff --git a/dandiapi/api/tests/test_asset.py b/dandiapi/api/tests/test_asset.py index 23fda2f4ae..2474561815 100644 --- a/dandiapi/api/tests/test_asset.py +++ b/dandiapi/api/tests/test_asset.py @@ -1,5 +1,4 @@ import json -import os.path from uuid import uuid4 from django.conf import settings @@ -1349,7 +1348,7 @@ def test_asset_download(api_client, storage, version, asset): download = requests.get(download_url, timeout=5) cd_header = download.headers.get('Content-Disposition') - assert cd_header == f'attachment; filename="{os.path.basename(asset.path)}"' + assert cd_header == f'attachment; filename="{asset.path.split("/")[-1]}"' with asset.blob.blob.file.open('rb') as reader: assert download.content == reader.read() @@ -1395,7 +1394,7 @@ def test_asset_download_embargo( download = requests.get(download_url, timeout=5) cd_header = download.headers.get('Content-Disposition') - assert cd_header == f'attachment; filename="{os.path.basename(asset.path)}"' + assert cd_header == f'attachment; filename="{asset.path.split("/")[-1]}"' with asset.embargoed_blob.blob.file.open('rb') as reader: assert download.content == reader.read() @@ -1430,7 +1429,7 @@ def test_asset_direct_download(api_client, storage, version, asset): download = requests.get(download_url, timeout=5) cd_header = download.headers.get('Content-Disposition') - assert cd_header == f'attachment; filename="{os.path.basename(asset.path)}"' + assert cd_header == f'attachment; filename="{asset.path.split("/")[-1]}"' with asset.blob.blob.file.open('rb') as reader: assert download.content == reader.read() @@ -1462,7 +1461,7 @@ def test_asset_direct_download_head(api_client, storage, version, asset): download = requests.get(download_url, timeout=5) cd_header = download.headers.get('Content-Disposition') - assert cd_header == f'attachment; filename="{os.path.basename(asset.path)}"' + assert cd_header == f'attachment; filename="{asset.path.split("/")[-1]}"' with asset.blob.blob.file.open('rb') as reader: assert download.content == reader.read() diff --git a/dandiapi/api/tests/test_upload.py b/dandiapi/api/tests/test_upload.py index 764b0ff1b9..15e5a4d918 100644 --- a/dandiapi/api/tests/test_upload.py +++ b/dandiapi/api/tests/test_upload.py @@ -1,4 +1,3 @@ -import os import uuid from django.core.files.base import ContentFile @@ -536,7 +535,7 @@ def test_upload_validate_wrong_size(api_client, user, upload): api_client.force_authenticate(user=user) wrong_content = b'not 100 bytes' - upload.blob.save(os.path.basename(upload.blob.name), ContentFile(wrong_content)) + upload.blob.save(upload.blob.name.split('/')[-1], ContentFile(wrong_content)) resp = api_client.post(f'/api/uploads/{upload.upload_id}/validate/') assert resp.status_code == 400 diff --git a/dandiapi/api/views/asset.py b/dandiapi/api/views/asset.py index aed2815b27..684ada27b0 100644 --- a/dandiapi/api/views/asset.py +++ b/dandiapi/api/views/asset.py @@ -22,8 +22,6 @@ # This should only be used for type interrogation, never instantiation MinioStorage = type('FakeMinioStorage', (), {}) -import os.path - from django.conf import settings from django.db import transaction from django.db.models import QuerySet @@ -143,7 +141,7 @@ def download(self, request, *args, **kwargs): serializer.is_valid(raise_exception=True) content_disposition = serializer.validated_data['content_disposition'] content_type = asset.metadata.get('encodingFormat', 'application/octet-stream') - asset_basename = os.path.basename(asset.path) + asset_basename = asset.path.split('/')[-1] if content_disposition == 'attachment': return HttpResponseRedirect( diff --git a/dandiapi/zarr/models.py b/dandiapi/zarr/models.py index 33a0dfdec3..3546b6213a 100644 --- a/dandiapi/zarr/models.py +++ b/dandiapi/zarr/models.py @@ -1,7 +1,6 @@ from __future__ import annotations import logging -from pathlib import Path from urllib.parse import urlparse, urlunparse from uuid import uuid4 @@ -100,7 +99,7 @@ class ZarrArchive(BaseZarrArchive): storage = get_storage() dandiset = models.ForeignKey(Dandiset, related_name='zarr_archives', on_delete=models.CASCADE) - def s3_path(self, zarr_path: str | Path): + def s3_path(self, zarr_path: str) -> str: """Generate a full S3 object path from a path in this zarr_archive.""" return f'{settings.DANDI_DANDISETS_BUCKET_PREFIX}{settings.DANDI_ZARR_PREFIX_NAME}/{self.zarr_id}/{zarr_path}' # noqa: E501 @@ -111,6 +110,6 @@ class EmbargoedZarrArchive(BaseZarrArchive): Dandiset, related_name='embargoed_zarr_archives', on_delete=models.CASCADE ) - def s3_path(self, zarr_path: str | Path): + def s3_path(self, zarr_path: str) -> str: """Generate a full S3 object path from a path in this zarr_archive.""" return f'{settings.DANDI_DANDISETS_EMBARGO_BUCKET_PREFIX}{settings.DANDI_ZARR_PREFIX_NAME}/{self.dandiset.identifier}/{self.zarr_id}/{zarr_path}' # noqa: E501 diff --git a/dandiapi/zarr/tests/test_zarr.py b/dandiapi/zarr/tests/test_zarr.py index e568d7c19e..8c857d1adf 100644 --- a/dandiapi/zarr/tests/test_zarr.py +++ b/dandiapi/zarr/tests/test_zarr.py @@ -235,7 +235,7 @@ def test_zarr_rest_delete_file( f'/api/zarr/{zarr_archive.zarr_id}/files/', [{'path': str(zarr_file.path)}] ) assert resp.status_code == 204 - assert not zarr_archive.storage.exists(zarr_archive.s3_path(zarr_file.path)) + assert not zarr_archive.storage.exists(zarr_archive.s3_path(str(zarr_file.path))) # Assert zarr is back in pending state zarr_archive.refresh_from_db() @@ -334,7 +334,7 @@ def test_zarr_rest_delete_multiple_files( # Assert not found for file in zarr_files: - assert not zarr_archive.storage.exists(zarr_archive.s3_path(file)) + assert not zarr_archive.storage.exists(zarr_archive.s3_path(str(file.path))) ingest_zarr_archive(zarr_archive.zarr_id) zarr_archive.refresh_from_db() @@ -367,7 +367,7 @@ def test_zarr_rest_delete_missing_file( assert resp.json() == [ f'File test-prefix/test-zarr/{zarr_archive.zarr_id}/does/not/exist does not exist.' ] - assert zarr_archive.storage.exists(zarr_archive.s3_path(zarr_file.path)) + assert zarr_archive.storage.exists(zarr_archive.s3_path(str(zarr_file.path))) # Ingest zarr_archive.status = ZarrArchiveStatus.UPLOADED diff --git a/dandiapi/zarr/views/__init__.py b/dandiapi/zarr/views/__init__.py index 7d780e0669..f5df4ecdb1 100644 --- a/dandiapi/zarr/views/__init__.py +++ b/dandiapi/zarr/views/__init__.py @@ -1,7 +1,6 @@ from __future__ import annotations import logging -from pathlib import Path from django.db import IntegrityError, transaction from django.db.models.query import QuerySet @@ -193,17 +192,16 @@ def files(self, request, zarr_id: str): serializer.is_valid(raise_exception=True) # The root path for this zarr in s3 - base_path = Path(zarr_archive.s3_path('')) + base_path = zarr_archive.s3_path('') # Retrieve and join query params limit = serializer.validated_data['limit'] download = serializer.validated_data['download'] raw_prefix = serializer.validated_data['prefix'] - full_prefix = str(base_path / raw_prefix) + full_prefix = f'{base_path}/{raw_prefix}' - _after = serializer.validated_data['after'] - after = str(base_path / _after) if _after else '' + after = f'{base_path}/{serializer.validated_data["after"]}' # Handle head request redirects if request.method == 'HEAD': @@ -229,7 +227,7 @@ def files(self, request, zarr_id: str): # Map/filter listing results = [ { - 'Key': str(Path(obj['Key']).relative_to(base_path)), + 'Key': obj['Key'].removeprefix(base_path), 'LastModified': obj['LastModified'], 'ETag': obj['ETag'].strip('"'), 'Size': obj['Size'],