🏷️(backend) add content-type to uploaded files #552

Merged · 3 commits, merged on Jan 28, 2025

Changes from all commits
5 changes: 3 additions & 2 deletions .github/workflows/impress.yml
@@ -206,10 +206,11 @@ jobs:
       - name: Install development dependencies
         run: pip install --user .[dev]

-      - name: Install gettext (required to compile messages)
+      - name: Install gettext (required to compile messages) and MIME support
         run: |
           sudo apt-get update
-          sudo apt-get install -y gettext pandoc
+          sudo apt-get install -y gettext pandoc shared-mime-info
+          sudo wget https://svn.apache.org/repos/asf/httpd/httpd/trunk/docs/conf/mime.types -O /etc/mime.types

       - name: Generate a MO file from strings extracted from the project
         run: python manage.py compilemessages
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -13,7 +13,7 @@ and this project adheres to

- github actions to managed Crowdin workflow
- 📈Integrate Posthog #540
-
+- 🏷️(backend) add content-type to uploaded files #552

## Changed

2 changes: 2 additions & 0 deletions Dockerfile
@@ -76,6 +76,8 @@ RUN apk add \
    pango \
    shared-mime-info

+RUN wget https://svn.apache.org/repos/asf/httpd/httpd/trunk/docs/conf/mime.types -O /etc/mime.types
+
# Copy entrypoint
COPY ./docker/files/usr/local/bin/entrypoint /usr/local/bin/entrypoint

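Both the CI job and the runtime image install `shared-mime-info` and fetch a populated `/etc/mime.types` because Python's standard `mimetypes` module builds its type/extension tables from system files such as `/etc/mime.types`, which minimal Alpine images and CI runners may lack. A small sketch of the lookup this enables, assuming the backend pairs python-magic detection with a `mimetypes` reverse lookup (the exact call sites live in the serializer change below):

```python
import mimetypes

import magic  # python-magic, already a backend dependency

# Sniff the real MIME type from the file's leading bytes, ignoring its name.
detected = magic.from_buffer(b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR", mime=True)
print(detected)  # image/png

# Map the MIME type back to an extension. mimetypes consults /etc/mime.types
# (among other known files) when it initializes; types missing from both the
# system database and its small built-in table come back as None.
print(mimetypes.guess_extension(detected))  # .png
```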
2 changes: 2 additions & 0 deletions src/backend/core/api/serializers.py
@@ -388,13 +388,15 @@ def validate_file(self, file):
            raise serializers.ValidationError("Could not determine file extension.")

        self.context["expected_extension"] = extension
+        self.context["content_type"] = magic_mime_type

        return file

    def validate(self, attrs):
        """Override validate to add the computed extension to validated_data."""
        attrs["expected_extension"] = self.context["expected_extension"]
        attrs["is_unsafe"] = self.context["is_unsafe"]
+        attrs["content_type"] = self.context["content_type"]
        return attrs

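Note on ordering: DRF runs field-level validators such as `validate_file` before the object-level `validate`, so stashing the detected type in `self.context` and then copying it into `attrs` is a reliable way to surface it through `serializer.validated_data`, which is where the view below picks it up.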
5 changes: 4 additions & 1 deletion src/backend/core/api/viewsets.py
@@ -605,7 +605,10 @@ def attachment_upload(self, request, *args, **kwargs):
        key = f"{document.key_base}/{ATTACHMENTS_FOLDER:s}/{file_id!s}.{extension:s}"

        # Prepare metadata for storage
-        extra_args = {"Metadata": {"owner": str(request.user.id)}}
+        extra_args = {
+            "Metadata": {"owner": str(request.user.id)},
+            "ContentType": serializer.validated_data["content_type"],
+        }
        if serializer.validated_data["is_unsafe"]:
            extra_args["Metadata"]["is_unsafe"] = "true"

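This dict matches boto3's `ExtraArgs` contract, in which `ContentType` and `Metadata` are both allowed upload arguments. A hedged sketch of how the view presumably hands the file to storage right after this hunk (the actual call site is not part of the diff):

```python
# Sketch only: assumes the view uploads through the boto3 client that
# django-storages exposes; `file`, `key`, and `extra_args` come from the
# view code shown above.
from django.core.files.storage import default_storage

s3_client = default_storage.connection.meta.client
s3_client.upload_fileobj(
    file,                         # the validated upload
    default_storage.bucket_name,  # target bucket
    key,                          # "<doc>/attachments/<uuid>.<ext>"
    ExtraArgs=extra_args,         # sets Content-Type and x-amz-meta-* headers
)
```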
Empty file.
Empty file.

(Presumably the `management/__init__.py` and `management/commands/__init__.py` package markers that Django requires to discover the new command below.)
@@ -0,0 +1,95 @@
"""Management command updating the metadata for all the files in the MinIO bucket."""

from django.core.files.storage import default_storage
from django.core.management.base import BaseCommand

import magic

from core.models import Document

# pylint: disable=too-many-locals, broad-exception-caught


class Command(BaseCommand):
"""Update the metadata for all the files in the MinIO bucket."""

help = __doc__

def handle(self, *args, **options):
"""Execute management command."""
s3_client = default_storage.connection.meta.client
bucket_name = default_storage.bucket_name

mime_detector = magic.Magic(mime=True)

documents = Document.objects.all()
self.stdout.write(
f"[INFO] Found {documents.count()} documents. Starting ContentType fix..."
)

for doc in documents:
doc_id_str = str(doc.id)
prefix = f"{doc_id_str}/attachments/"
self.stdout.write(
f"[INFO] Processing attachments under prefix '{prefix}' ..."
)

continuation_token = None
total_updated = 0

while True:
list_kwargs = {"Bucket": bucket_name, "Prefix": prefix}
if continuation_token:
list_kwargs["ContinuationToken"] = continuation_token

response = s3_client.list_objects_v2(**list_kwargs)

# If no objects found under this prefix, break out of the loop
if "Contents" not in response:
break

for obj in response["Contents"]:
key = obj["Key"]

# Skip if it's a folder
if key.endswith("/"):
continue

try:
# Get existing metadata
head_resp = s3_client.head_object(Bucket=bucket_name, Key=key)

# Read first ~1KB for MIME detection
partial_obj = s3_client.get_object(
Bucket=bucket_name, Key=key, Range="bytes=0-1023"
)
partial_data = partial_obj["Body"].read()

# Detect MIME type
magic_mime_type = mime_detector.from_buffer(partial_data)

# Update ContentType
s3_client.copy_object(
Bucket=bucket_name,
CopySource={"Bucket": bucket_name, "Key": key},
Key=key,
ContentType=magic_mime_type,
Metadata=head_resp.get("Metadata", {}),
MetadataDirective="REPLACE",
)
total_updated += 1

except Exception as exc: # noqa
self.stderr.write(
f"[ERROR] Could not update ContentType for {key}: {exc}"
)

if response.get("IsTruncated"):
continuation_token = response.get("NextContinuationToken")
else:
break

if total_updated > 0:
self.stdout.write(
f"[INFO] -> Updated {total_updated} objects for Document {doc_id_str}."
)
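Two implementation notes, inferred from S3 semantics rather than spelled out in the PR: `copy_object` onto the same key is the standard way to rewrite an object's headers without re-uploading its body, and `MetadataDirective="REPLACE"` discards existing user metadata unless it is passed back, which is why the command round-trips the metadata from `head_object`. Reading only `bytes=0-1023` suffices because libmagic identifies most formats from their leading signature bytes. Once deployed, the backfill runs like any other management command; a minimal usage sketch:

```python
# Equivalent to `python manage.py update_files_content_type_metadata`,
# e.g. from a shell, a test, or a post-deployment hook.
from django.core.management import call_command

call_command("update_files_content_type_metadata")
```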
@@ -0,0 +1,50 @@
"""
Unit test for `update_files_content_type_metadata` command.
"""

import uuid

from django.core.files.storage import default_storage
from django.core.management import call_command

import pytest

from core import factories


@pytest.mark.django_db
def test_update_files_content_type_metadata():
"""
Test that the command `update_files_content_type_metadata`
fixes the ContentType of attachment in the storage.
"""
s3_client = default_storage.connection.meta.client
bucket_name = default_storage.bucket_name

# Create files with a wrong ContentType
keys = []
for _ in range(10):
doc_id = uuid.uuid4()
factories.DocumentFactory(id=doc_id)
key = f"{doc_id}/attachments/testfile.png"
keys.append(key)
fake_png = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR..."
s3_client.put_object(
Bucket=bucket_name,
Key=key,
Body=fake_png,
ContentType="text/plain",
Metadata={"owner": "None"},
)

# Call the command that fixes the ContentType
call_command("update_files_content_type_metadata")

for key in keys:
head_resp = s3_client.head_object(Bucket=bucket_name, Key=key)
assert (
head_resp["ContentType"] == "image/png"
), f"ContentType not fixed, got {head_resp['ContentType']!r}"

# Check that original metadata was preserved
assert head_resp["Metadata"].get("owner") == "None"
@@ -64,12 +64,22 @@ def test_api_documents_attachment_upload_anonymous_success():
    assert response.status_code == 201

    pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.png")
-    match = pattern.search(response.json()["file"])
+    file_path = response.json()["file"]
+    match = pattern.search(file_path)
    file_id = match.group(1)

    # Validate that file_id is a valid UUID
    uuid.UUID(file_id)

+    # Now, check the metadata of the uploaded file
+    key = file_path.replace("/media", "")
+    file_head = default_storage.connection.meta.client.head_object(
+        Bucket=default_storage.bucket_name, Key=key
+    )
+
+    assert file_head["Metadata"] == {"owner": "None"}
+    assert file_head["ContentType"] == "image/png"


@pytest.mark.parametrize(
    "reach, role",
@@ -206,6 +216,7 @@ def test_api_documents_attachment_upload_success(via, role, mock_user_teams):
        Bucket=default_storage.bucket_name, Key=key
    )
    assert file_head["Metadata"] == {"owner": str(user.id)}
+    assert file_head["ContentType"] == "image/png"


def test_api_documents_attachment_upload_invalid(client):
@@ -247,16 +258,18 @@ def test_api_documents_attachment_upload_size_limit_exceeded(settings):


@pytest.mark.parametrize(
-    "name,content,extension",
+    "name,content,extension,content_type",
    [
-        ("test.exe", b"text", "exe"),
-        ("test", b"text", "txt"),
-        ("test.aaaaaa", b"test", "txt"),
-        ("test.txt", PIXEL, "txt"),
-        ("test.py", b"#!/usr/bin/python", "py"),
+        ("test.exe", b"text", "exe", "text/plain"),
+        ("test", b"text", "txt", "text/plain"),
+        ("test.aaaaaa", b"test", "txt", "text/plain"),
+        ("test.txt", PIXEL, "txt", "image/png"),
+        ("test.py", b"#!/usr/bin/python", "py", "text/plain"),
    ],
)
-def test_api_documents_attachment_upload_fix_extension(name, content, extension):
+def test_api_documents_attachment_upload_fix_extension(
+    name, content, extension, content_type
+):
    """
    A file with no extension or a wrong extension is accepted and the extension
    is corrected in storage.
@@ -287,6 +300,7 @@ def test_api_documents_attachment_upload_fix_extension(name, content, extension)
        Bucket=default_storage.bucket_name, Key=key
    )
    assert file_head["Metadata"] == {"owner": str(user.id), "is_unsafe": "true"}
+    assert file_head["ContentType"] == content_type


def test_api_documents_attachment_upload_empty_file():
def test_api_documents_attachment_upload_empty_file():
@@ -335,3 +349,4 @@ def test_api_documents_attachment_upload_unsafe():
        Bucket=default_storage.bucket_name, Key=key
    )
    assert file_head["Metadata"] == {"owner": str(user.id), "is_unsafe": "true"}
+    assert file_head["ContentType"] == "application/octet-stream"
2 changes: 1 addition & 1 deletion src/helm/impress/Chart.yaml
@@ -1,5 +1,5 @@
apiVersion: v2
type: application
name: docs
-version: 2.0.1-beta.7
+version: 2.0.1-beta.8
appVersion: latest
5 changes: 5 additions & 0 deletions src/helm/impress/README.md
@@ -104,6 +104,11 @@
| `backend.migrate.restartPolicy` | backend migrate job restart policy | `Never` |
| `backend.createsuperuser.command` | backend migrate command | `["/bin/sh","-c","python manage.py createsuperuser --email $DJANGO_SUPERUSER_EMAIL --password $DJANGO_SUPERUSER_PASSWORD\n"]` |
| `backend.createsuperuser.restartPolicy` | backend migrate job restart policy | `Never` |
+| `backend.job` | job dedicated to running an arbitrary management command, for example after a deployment | |
+| `backend.job.name` | The name to use to describe this job | `""` |
+| `backend.job.command` | The management command to execute | `[]` |
+| `backend.job.restartPolicy` | The restart policy for the job | `Never` |
+| `backend.job.annotations` | Annotations to add to the job [default: `argocd.argoproj.io/hook: PostSync`] | |
| `backend.probes.liveness.path` | Configure path for backend HTTP liveness probe | `/__heartbeat__` |
| `backend.probes.liveness.targetPort` | Configure port for backend HTTP liveness probe | `undefined` |
| `backend.probes.liveness.initialDelaySeconds` | Configure initial delay for backend liveness probe | `10` |
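The new `backend.job` hook is presumably how the backfill above gets run in deployed environments: with the default `argocd.argoproj.io/hook: PostSync` annotation, a one-shot job can execute `update_files_content_type_metadata` right after a release, although the PR does not wire the two together explicitly.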