From e8d95facdf928558c3fe0935c70b77ed4b974a39 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Mon, 7 Oct 2024 20:10:42 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(backend)=20allow=20uploading=20more?= =?UTF-8?q?=20types=20of=20attachments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to allow users to upload files to a document, not just images. We try to enforce coherence between the file extension and the real mime type of its content. If a file is deemed unsafe, it is still accepted during upload and the information is stored as metadata on the object for display to readers. --- CHANGELOG.md | 1 + Dockerfile | 11 +- src/backend/core/api/serializers.py | 41 +++++- src/backend/core/api/viewsets.py | 18 ++- .../test_api_documents_attachment_upload.py | 130 ++++++++++++++---- src/backend/impress/settings.py | 75 +++++++++- src/backend/pyproject.toml | 1 + 7 files changed, 228 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 66450106e..5be3ef36b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ and this project adheres to ## Added +- ✨(backend) allow uploading more types of attachments #309 - ✨(backend) add name fields to the user synchronized with OIDC #301 - ✨(ci) add security scan #291 - ♻️(frontend) Add versions #277 diff --git a/Dockerfile b/Dockerfile index f9570d402..f6378f9e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -65,14 +65,15 @@ ENV PYTHONUNBUFFERED=1 # Install required system libs RUN apk add \ - gettext \ cairo \ - libffi-dev \ + file \ + font-noto \ + font-noto-emoji \ + gettext \ gdk-pixbuf \ - pango \ + libffi-dev \ pandoc \ - font-noto-emoji \ - font-noto \ + pango \ shared-mime-info # Copy entrypoint diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index d98049f2f..269ce728a 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -6,6 +6,7 @@ from django.db.models import Q from django.utils.translation import gettext_lazy as _ +import magic from rest_framework import exceptions, serializers from core import models @@ -219,16 +220,42 @@ def validate_file(self, file): f"File size exceeds the maximum limit of {max_size:d} MB." ) - # Validate file type - mime_type, _ = mimetypes.guess_type(file.name) - if mime_type not in settings.DOCUMENT_IMAGE_ALLOWED_MIME_TYPES: - mime_types = ", ".join(settings.DOCUMENT_IMAGE_ALLOWED_MIME_TYPES) - raise serializers.ValidationError( - f"File type '{mime_type:s}' is not allowed. Allowed types are: {mime_types:s}" - ) + extension = file.name.rpartition(".")[-1] if "." in file.name else None + + # Read the first few bytes to determine the MIME type accurately + mime = magic.Magic(mime=True) + magic_mime_type = mime.from_buffer(file.read(1024)) + file.seek(0) # Reset file pointer to the beginning after reading + + self.context["is_unsafe"] = ( + magic_mime_type in settings.DOCUMENT_UNSAFE_MIME_TYPES + ) + + extension_mime_type, _ = mimetypes.guess_type(file.name) + + # Try guessing a coherent extension from the mimetype + if extension_mime_type != magic_mime_type: + self.context["is_unsafe"] = True + + guessed_ext = mimetypes.guess_extension(magic_mime_type) + # Missing extensions or extensions longer than 5 characters (it's as long as an extension + # can be) are replaced by the extension we eventually guessed from mimetype. + if (extension is None or len(extension) > 5) and guessed_ext: + extension = guessed_ext[1:] + + if extension is None: + raise serializers.ValidationError("Could not determine file extension.") + + self.context["expected_extension"] = extension return file + def validate(self, attrs): + """Override validate to add the computed extension to validated_data.""" + attrs["expected_extension"] = self.context["expected_extension"] + attrs["is_unsafe"] = self.context["is_unsafe"] + return attrs + class TemplateSerializer(BaseResourceSerializer): """Serialize templates.""" diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 23cb87aab..918ad426b 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -1,6 +1,5 @@ """API endpoints""" -import os import re import uuid from urllib.parse import urlparse @@ -476,15 +475,22 @@ def attachment_upload(self, request, *args, **kwargs): return drf_response.Response( serializer.errors, status=status.HTTP_400_BAD_REQUEST ) - # Extract the file extension from the original filename - file = serializer.validated_data["file"] - extension = os.path.splitext(file.name)[1] # Generate a generic yet unique filename to store the image in object storage file_id = uuid.uuid4() - key = f"{document.key_base}/{ATTACHMENTS_FOLDER:s}/{file_id!s}{extension:s}" + extension = serializer.validated_data["expected_extension"] + key = f"{document.key_base}/{ATTACHMENTS_FOLDER:s}/{file_id!s}.{extension:s}" + + # Prepare metadata for storage + metadata = {"Metadata": {"owner": str(request.user.id)}} + if serializer.validated_data["is_unsafe"]: + metadata["Metadata"]["is_unsafe"] = "true" + + file = serializer.validated_data["file"] + default_storage.connection.meta.client.upload_fileobj( + file, default_storage.bucket_name, key, ExtraArgs=metadata + ) - default_storage.save(key, file) return drf_response.Response( {"file": f"{settings.MEDIA_URL:s}{key:s}"}, status=status.HTTP_201_CREATED ) diff --git a/src/backend/core/tests/documents/test_api_documents_attachment_upload.py b/src/backend/core/tests/documents/test_api_documents_attachment_upload.py index 064d0c1fb..1288f8ca6 100644 --- a/src/backend/core/tests/documents/test_api_documents_attachment_upload.py +++ b/src/backend/core/tests/documents/test_api_documents_attachment_upload.py @@ -5,7 +5,7 @@ import re import uuid -from django.core.files.base import ContentFile +from django.core.files.storage import default_storage from django.core.files.uploadedfile import SimpleUploadedFile import pytest @@ -16,6 +16,12 @@ pytestmark = pytest.mark.django_db +PIXEL = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00" + b"\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\xf8\xff\xff?\x00\x05\xfe\x02\xfe" + b"\xa7V\xbd\xfa\x00\x00\x00\x00IEND\xaeB`\x82" +) + @pytest.mark.parametrize( "reach, role", @@ -33,7 +39,7 @@ def test_api_documents_attachment_upload_anonymous_forbidden(reach, role): and role don't allow it. """ document = factories.DocumentFactory(link_reach=reach, link_role=role) - file = SimpleUploadedFile("test_file.jpg", b"Dummy content") + file = SimpleUploadedFile(name="test.png", content=PIXEL, content_type="image/png") url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" response = APIClient().post(url, {"file": file}, format="multipart") @@ -50,14 +56,14 @@ def test_api_documents_attachment_upload_anonymous_success(): if the link reach and role permit it. """ document = factories.DocumentFactory(link_reach="public", link_role="editor") - file = SimpleUploadedFile("test_file.jpg", b"Dummy content") + file = SimpleUploadedFile(name="test.png", content=PIXEL, content_type="image/png") url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" response = APIClient().post(url, {"file": file}, format="multipart") assert response.status_code == 201 - pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.jpg") + pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.png") match = pattern.search(response.json()["file"]) file_id = match.group(1) @@ -85,7 +91,7 @@ def test_api_documents_attachment_upload_authenticated_forbidden(reach, role): client.force_login(user) document = factories.DocumentFactory(link_reach=reach, link_role=role) - file = SimpleUploadedFile("test_file.jpg", b"Dummy content") + file = SimpleUploadedFile(name="test.png", content=PIXEL, content_type="image/png") url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" response = client.post(url, {"file": file}, format="multipart") @@ -114,14 +120,14 @@ def test_api_documents_attachment_upload_authenticated_success(reach, role): client.force_login(user) document = factories.DocumentFactory(link_reach=reach, link_role=role) - file = SimpleUploadedFile("test_file.jpg", b"Dummy content") + file = SimpleUploadedFile(name="test.png", content=PIXEL, content_type="image/png") url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" response = client.post(url, {"file": file}, format="multipart") assert response.status_code == 201 - pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.jpg") + pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.png") match = pattern.search(response.json()["file"]) file_id = match.group(1) @@ -148,7 +154,7 @@ def test_api_documents_attachment_upload_reader(via, mock_user_teams): document=document, team="lasuite", role="reader" ) - file = SimpleUploadedFile("test_file.jpg", b"Dummy content") + file = SimpleUploadedFile(name="test.png", content=PIXEL, content_type="image/png") url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" response = client.post(url, {"file": file}, format="multipart") @@ -179,20 +185,28 @@ def test_api_documents_attachment_upload_success(via, role, mock_user_teams): document=document, team="lasuite", role=role ) - file = SimpleUploadedFile("test_file.jpg", b"Dummy content") + file = SimpleUploadedFile(name="test.png", content=PIXEL, content_type="image/png") url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" response = client.post(url, {"file": file}, format="multipart") assert response.status_code == 201 - pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.jpg") - match = pattern.search(response.json()["file"]) + file_path = response.json()["file"] + pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.png") + match = pattern.search(file_path) file_id = match.group(1) # Validate that file_id is a valid UUID uuid.UUID(file_id) + # Now, check the metadata of the uploaded file + key = file_path.replace("/media", "") + file_head = default_storage.connection.meta.client.head_object( + Bucket=default_storage.bucket_name, Key=key + ) + assert file_head["Metadata"] == {"owner": str(user.id)} + def test_api_documents_attachment_upload_invalid(client): """Attempt to upload without a file should return an explicit error.""" @@ -222,8 +236,9 @@ def test_api_documents_attachment_upload_size_limit_exceeded(settings): url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" # Create a temporary file larger than the allowed size - content = b"a" * (1048576 + 1) - file = ContentFile(content, name="test.jpg") + file = SimpleUploadedFile( + name="test.txt", content=b"a" * (1048576 + 1), content_type="text/plain" + ) response = client.post(url, {"file": file}, format="multipart") @@ -231,10 +246,21 @@ def test_api_documents_attachment_upload_size_limit_exceeded(settings): assert response.json() == {"file": ["File size exceeds the maximum limit of 1 MB."]} -def test_api_documents_attachment_upload_type_not_allowed(settings): - """The uploaded file should be of a whitelisted type.""" - settings.DOCUMENT_IMAGE_ALLOWED_MIME_TYPES = ["image/jpeg", "image/png"] - +@pytest.mark.parametrize( + "name,content,extension", + [ + ("test.exe", b"text", "exe"), + ("test", b"text", "txt"), + ("test.aaaaaa", b"test", "txt"), + ("test.txt", PIXEL, "txt"), + ("test.py", b"#!/usr/bin/python", "py"), + ], +) +def test_api_documents_attachment_upload_fix_extension(name, content, extension): + """ + A file with no extension or a wrong extension is accepted and the extension + is corrected in storage. + """ user = factories.UserFactory() client = APIClient() client.force_login(user) @@ -242,14 +268,70 @@ def test_api_documents_attachment_upload_type_not_allowed(settings): document = factories.DocumentFactory(users=[(user, "owner")]) url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" - # Create a temporary file with a not allowed type (e.g., text file) - file = ContentFile(b"a" * 1048576, name="test.txt") + file = SimpleUploadedFile(name=name, content=content) + response = client.post(url, {"file": file}, format="multipart") + + assert response.status_code == 201 + + file_path = response.json()["file"] + pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.{extension:s}") + match = pattern.search(file_path) + file_id = match.group(1) + + # Validate that file_id is a valid UUID + uuid.UUID(file_id) + + # Now, check the metadata of the uploaded file + key = file_path.replace("/media", "") + file_head = default_storage.connection.meta.client.head_object( + Bucket=default_storage.bucket_name, Key=key + ) + assert file_head["Metadata"] == {"owner": str(user.id), "is_unsafe": "true"} + + +def test_api_documents_attachment_upload_empty_file(): + """An empty file should be rejected.""" + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document = factories.DocumentFactory(users=[(user, "owner")]) + url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" + file = SimpleUploadedFile(name="test.png", content=b"") response = client.post(url, {"file": file}, format="multipart") assert response.status_code == 400 - assert response.json() == { - "file": [ - "File type 'text/plain' is not allowed. Allowed types are: image/jpeg, image/png" - ] - } + assert response.json() == {"file": ["The submitted file is empty."]} + + +def test_api_documents_attachment_upload_unsafe(): + """A file with an unsafe mime type should be tagged as such.""" + user = factories.UserFactory() + client = APIClient() + client.force_login(user) + + document = factories.DocumentFactory(users=[(user, "owner")]) + url = f"/api/v1.0/documents/{document.id!s}/attachment-upload/" + + file = SimpleUploadedFile( + name="script.exe", content=b"\x4d\x5a\x90\x00\x03\x00\x00\x00" + ) + response = client.post(url, {"file": file}, format="multipart") + + assert response.status_code == 201 + + file_path = response.json()["file"] + pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.exe") + match = pattern.search(file_path) + file_id = match.group(1) + + # Validate that file_id is a valid UUID + uuid.UUID(file_id) + + # Now, check the metadata of the uploaded file + key = file_path.replace("/media", "") + file_head = default_storage.connection.meta.client.head_object( + Bucket=default_storage.bucket_name, Key=key + ) + assert file_head["Metadata"] == {"owner": str(user.id), "is_unsafe": "true"} diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index a4812c445..7d06a1c7b 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -144,14 +144,75 @@ class Base(Configuration): environ_name="DOCUMENT_IMAGE_MAX_SIZE", environ_prefix=None, ) - DOCUMENT_IMAGE_ALLOWED_MIME_TYPES = [ - "image/bmp", - "image/gif", - "image/jpeg", - "image/png", + + DOCUMENT_UNSAFE_MIME_TYPES = [ + # Executable Files + "application/x-msdownload", + "application/x-bat", + "application/x-dosexec", + "application/x-sh", + "application/x-ms-dos-executable", + "application/x-msi", + "application/java-archive", + "application/octet-stream", + # Dynamic Web Pages + "application/x-httpd-php", + "application/x-asp", + "application/x-aspx", + "application/jsp", + "application/xhtml+xml", + "application/x-python-code", + "application/x-perl", + "text/html", + "text/javascript", + "text/x-php", + # System Files + "application/x-msdownload", + "application/x-sys", + "application/x-drv", + "application/cpl", + "application/x-apple-diskimage", + # Script Files + "application/javascript", + "application/x-vbscript", + "application/x-powershell", + "application/x-shellscript", + # Compressed/Archive Files + "application/zip", + "application/x-tar", + "application/gzip", + "application/x-bzip2", + "application/x-7z-compressed", + "application/x-rar", + "application/x-rar-compressed", + "application/x-compress", + "application/x-lzma", + # Macros in Documents + "application/vnd.ms-word", + "application/vnd.ms-excel", + "application/vnd.ms-powerpoint", + "application/vnd.ms-word.document.macroenabled.12", + "application/vnd.ms-excel.sheet.macroenabled.12", + "application/vnd.ms-powerpoint.presentation.macroenabled.12", + # Disk Images & Virtual Disk Files + "application/x-iso9660-image", + "application/x-vmdk", + "application/x-apple-diskimage", + "application/x-dmg", + # Other Dangerous MIME Types + "application/x-ms-application", + "application/x-msdownload", + "application/x-shockwave-flash", + "application/x-silverlight-app", + "application/x-java-vm", + "application/x-bittorrent", + "application/hta", + "application/x-csh", + "application/x-ksh", + "application/x-ms-regedit", + "application/x-msdownload", + "application/xml", "image/svg+xml", - "image/tiff", - "image/webp", ] # Document versions diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml index 25a3c32e2..3c3805ce6 100644 --- a/src/backend/pyproject.toml +++ b/src/backend/pyproject.toml @@ -50,6 +50,7 @@ dependencies = [ "PyJWT==2.9.0", "pypandoc==1.14", "python-frontmatter==1.1.0", + "python-magic==0.4.27", "requests==2.32.3", "sentry-sdk==2.16.0", "url-normalize==1.4.3",