Skip to content

Commit

Permalink
Don't use filesystem APIs to manipulate URLs
Browse files Browse the repository at this point in the history
Following #1743, this removes the remaining uses of the filesystem APIs
`os.path` and `pathlib` (which are platform-dependent and contain methods for
local filesystem manipulation) to process URLs.

The `zarr_checksum` package still has this confusion in its `ZarrArchiveFile`
class, but attempts to refactor some of that code failed due to other issues
with it. So, the `ZarrArchiveFile.path` field is just cast to a `str` when
interfacing with local application code.
  • Loading branch information
brianhelba committed Dec 14, 2023
1 parent 09b80aa commit d9d25cc
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 25 deletions.
5 changes: 2 additions & 3 deletions dandiapi/analytics/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from collections import Counter
from collections.abc import Generator
from pathlib import Path

from celery.app import shared_task
from celery.utils.log import get_task_logger
Expand Down Expand Up @@ -70,7 +69,7 @@ def process_s3_log_file_task(bucket: LogBucket, s3_log_key: str) -> None:

# short circuit if the log file has already been processed. note that this doesn't guarantee
# exactly once processing, that's what the unique constraint on ProcessedS3Log is for.
if ProcessedS3Log.objects.filter(name=Path(s3_log_key).name, embargoed=embargoed).exists():
if ProcessedS3Log.objects.filter(name=s3_log_key.split('/')[-1], embargoed=embargoed).exists():
return

s3 = get_boto_client(get_storage() if not embargoed else get_embargo_storage())
Expand All @@ -86,7 +85,7 @@ def process_s3_log_file_task(bucket: LogBucket, s3_log_key: str) -> None:

with transaction.atomic():
try:
log = ProcessedS3Log(name=Path(s3_log_key).name, embargoed=embargoed)
log = ProcessedS3Log(name=s3_log_key.split('/')[-1], embargoed=embargoed)
# disable constraint validation checking so duplicate errors can be detected and
# ignored. the rest of the full_clean errors should still be raised.
log.full_clean(validate_constraints=False)
Expand Down
9 changes: 4 additions & 5 deletions dandiapi/api/tests/test_asset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import json
import os.path
from uuid import uuid4

from django.conf import settings
Expand Down Expand Up @@ -1349,7 +1348,7 @@ def test_asset_download(api_client, storage, version, asset):
download = requests.get(download_url, timeout=5)
cd_header = download.headers.get('Content-Disposition')

assert cd_header == f'attachment; filename="{os.path.basename(asset.path)}"'
assert cd_header == f'attachment; filename="{asset.path.split("/")[-1]}"'

with asset.blob.blob.file.open('rb') as reader:
assert download.content == reader.read()
Expand Down Expand Up @@ -1395,7 +1394,7 @@ def test_asset_download_embargo(
download = requests.get(download_url, timeout=5)
cd_header = download.headers.get('Content-Disposition')

assert cd_header == f'attachment; filename="{os.path.basename(asset.path)}"'
assert cd_header == f'attachment; filename="{asset.path.split("/")[-1]}"'

with asset.embargoed_blob.blob.file.open('rb') as reader:
assert download.content == reader.read()
Expand Down Expand Up @@ -1430,7 +1429,7 @@ def test_asset_direct_download(api_client, storage, version, asset):
download = requests.get(download_url, timeout=5)
cd_header = download.headers.get('Content-Disposition')

assert cd_header == f'attachment; filename="{os.path.basename(asset.path)}"'
assert cd_header == f'attachment; filename="{asset.path.split("/")[-1]}"'

with asset.blob.blob.file.open('rb') as reader:
assert download.content == reader.read()
Expand Down Expand Up @@ -1462,7 +1461,7 @@ def test_asset_direct_download_head(api_client, storage, version, asset):
download = requests.get(download_url, timeout=5)
cd_header = download.headers.get('Content-Disposition')

assert cd_header == f'attachment; filename="{os.path.basename(asset.path)}"'
assert cd_header == f'attachment; filename="{asset.path.split("/")[-1]}"'

with asset.blob.blob.file.open('rb') as reader:
assert download.content == reader.read()
Expand Down
3 changes: 1 addition & 2 deletions dandiapi/api/tests/test_upload.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
import uuid

from django.core.files.base import ContentFile
Expand Down Expand Up @@ -536,7 +535,7 @@ def test_upload_validate_wrong_size(api_client, user, upload):
api_client.force_authenticate(user=user)

wrong_content = b'not 100 bytes'
upload.blob.save(os.path.basename(upload.blob.name), ContentFile(wrong_content))
upload.blob.save(upload.blob.name.split('/')[-1], ContentFile(wrong_content))

resp = api_client.post(f'/api/uploads/{upload.upload_id}/validate/')
assert resp.status_code == 400
Expand Down
4 changes: 1 addition & 3 deletions dandiapi/api/views/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
# This should only be used for type interrogation, never instantiation
MinioStorage = type('FakeMinioStorage', (), {})

import os.path

from django.conf import settings
from django.db import transaction
from django.db.models import QuerySet
Expand Down Expand Up @@ -143,7 +141,7 @@ def download(self, request, *args, **kwargs):
serializer.is_valid(raise_exception=True)
content_disposition = serializer.validated_data['content_disposition']
content_type = asset.metadata.get('encodingFormat', 'application/octet-stream')
asset_basename = os.path.basename(asset.path)
asset_basename = asset.path.split('/')[-1]

if content_disposition == 'attachment':
return HttpResponseRedirect(
Expand Down
5 changes: 2 additions & 3 deletions dandiapi/zarr/models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import logging
from pathlib import Path
from urllib.parse import urlparse, urlunparse
from uuid import uuid4

Expand Down Expand Up @@ -100,7 +99,7 @@ class ZarrArchive(BaseZarrArchive):
storage = get_storage()
dandiset = models.ForeignKey(Dandiset, related_name='zarr_archives', on_delete=models.CASCADE)

def s3_path(self, zarr_path: str | Path):
def s3_path(self, zarr_path: str) -> str:
"""Generate a full S3 object path from a path in this zarr_archive."""
return f'{settings.DANDI_DANDISETS_BUCKET_PREFIX}{settings.DANDI_ZARR_PREFIX_NAME}/{self.zarr_id}/{zarr_path}' # noqa: E501

Expand All @@ -111,6 +110,6 @@ class EmbargoedZarrArchive(BaseZarrArchive):
Dandiset, related_name='embargoed_zarr_archives', on_delete=models.CASCADE
)

def s3_path(self, zarr_path: str | Path):
def s3_path(self, zarr_path: str) -> str:
"""Generate a full S3 object path from a path in this zarr_archive."""
return f'{settings.DANDI_DANDISETS_EMBARGO_BUCKET_PREFIX}{settings.DANDI_ZARR_PREFIX_NAME}/{self.dandiset.identifier}/{self.zarr_id}/{zarr_path}' # noqa: E501
6 changes: 3 additions & 3 deletions dandiapi/zarr/tests/test_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def test_zarr_rest_delete_file(
f'/api/zarr/{zarr_archive.zarr_id}/files/', [{'path': str(zarr_file.path)}]
)
assert resp.status_code == 204
assert not zarr_archive.storage.exists(zarr_archive.s3_path(zarr_file.path))
assert not zarr_archive.storage.exists(zarr_archive.s3_path(str(zarr_file.path)))

# Assert zarr is back in pending state
zarr_archive.refresh_from_db()
Expand Down Expand Up @@ -334,7 +334,7 @@ def test_zarr_rest_delete_multiple_files(

# Assert not found
for file in zarr_files:
assert not zarr_archive.storage.exists(zarr_archive.s3_path(file))
assert not zarr_archive.storage.exists(zarr_archive.s3_path(str(file.path)))

ingest_zarr_archive(zarr_archive.zarr_id)
zarr_archive.refresh_from_db()
Expand Down Expand Up @@ -367,7 +367,7 @@ def test_zarr_rest_delete_missing_file(
assert resp.json() == [
f'File test-prefix/test-zarr/{zarr_archive.zarr_id}/does/not/exist does not exist.'
]
assert zarr_archive.storage.exists(zarr_archive.s3_path(zarr_file.path))
assert zarr_archive.storage.exists(zarr_archive.s3_path(str(zarr_file.path)))

# Ingest
zarr_archive.status = ZarrArchiveStatus.UPLOADED
Expand Down
10 changes: 4 additions & 6 deletions dandiapi/zarr/views/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import logging
from pathlib import Path

from django.db import IntegrityError, transaction
from django.db.models.query import QuerySet
Expand Down Expand Up @@ -193,17 +192,16 @@ def files(self, request, zarr_id: str):
serializer.is_valid(raise_exception=True)

# The root path for this zarr in s3
base_path = Path(zarr_archive.s3_path(''))
base_path = zarr_archive.s3_path('')

# Retrieve and join query params
limit = serializer.validated_data['limit']
download = serializer.validated_data['download']

raw_prefix = serializer.validated_data['prefix']
full_prefix = str(base_path / raw_prefix)
full_prefix = f'{base_path}/{raw_prefix}'

_after = serializer.validated_data['after']
after = str(base_path / _after) if _after else ''
after = f'{base_path}/{serializer.validated_data["after"]}'

# Handle head request redirects
if request.method == 'HEAD':
Expand All @@ -229,7 +227,7 @@ def files(self, request, zarr_id: str):
# Map/filter listing
results = [
{
'Key': str(Path(obj['Key']).relative_to(base_path)),
'Key': obj['Key'].removeprefix(base_path),
'LastModified': obj['LastModified'],
'ETag': obj['ETag'].strip('"'),
'Size': obj['Size'],
Expand Down

0 comments on commit d9d25cc

Please sign in to comment.