From 7d95cec612751e5c7edc884daeb81831b40d6a52 Mon Sep 17 00:00:00 2001 From: Anthony LC Date: Wed, 15 Jan 2025 15:58:46 +0100 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=8F=B7=EF=B8=8F(backend)=20add=20cont?= =?UTF-8?q?ent-type=20to=20uploaded=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All the uploaded files had the content-type set to `application/octet-stream`. It create issues when the file is downloaded from the frontend because the browser doesn't know how to handle the file. We now determine the content-type of the file and set it to the file object. --- .github/workflows/impress.yml | 5 +-- CHANGELOG.md | 2 +- Dockerfile | 2 ++ src/backend/core/api/serializers.py | 2 ++ src/backend/core/api/viewsets.py | 5 ++- .../test_api_documents_attachment_upload.py | 31 ++++++++++++++----- 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/.github/workflows/impress.yml b/.github/workflows/impress.yml index 896f07efb..385aa633c 100644 --- a/.github/workflows/impress.yml +++ b/.github/workflows/impress.yml @@ -206,10 +206,11 @@ jobs: - name: Install development dependencies run: pip install --user .[dev] - - name: Install gettext (required to compile messages) + - name: Install gettext (required to compile messages) and MIME support run: | sudo apt-get update - sudo apt-get install -y gettext pandoc + sudo apt-get install -y gettext pandoc shared-mime-info + sudo wget https://svn.apache.org/repos/asf/httpd/httpd/trunk/docs/conf/mime.types -O /etc/mime.types - name: Generate a MO file from strings extracted from the project run: python manage.py compilemessages diff --git a/CHANGELOG.md b/CHANGELOG.md index 0805656a0..da6a6826f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ and this project adheres to - github actions to managed Crowdin workflow - 📈Integrate Posthog #540 - +- 🏷️(backend) add content-type to uploaded files #552 ## Changed diff --git a/Dockerfile b/Dockerfile index 941f07476..6547f0b65 100644 --- a/Dockerfile +++ b/Dockerfile @@ -76,6 +76,8 @@ RUN apk add \ pango \ shared-mime-info +RUN wget https://svn.apache.org/repos/asf/httpd/httpd/trunk/docs/conf/mime.types -O /etc/mime.types + # Copy entrypoint COPY ./docker/files/usr/local/bin/entrypoint /usr/local/bin/entrypoint diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index d5b0d4c7d..e2369f499 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -388,6 +388,7 @@ def validate_file(self, file): raise serializers.ValidationError("Could not determine file extension.") self.context["expected_extension"] = extension + self.context["content_type"] = magic_mime_type return file @@ -395,6 +396,7 @@ def validate(self, attrs): """Override validate to add the computed extension to validated_data.""" attrs["expected_extension"] = self.context["expected_extension"] attrs["is_unsafe"] = self.context["is_unsafe"] + attrs["content_type"] = self.context["content_type"] return attrs diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 02b0f2778..10adee356 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -605,7 +605,10 @@ def attachment_upload(self, request, *args, **kwargs): key = f"{document.key_base}/{ATTACHMENTS_FOLDER:s}/{file_id!s}.{extension:s}" # Prepare metadata for storage - extra_args = {"Metadata": {"owner": str(request.user.id)}} + extra_args = { + "Metadata": {"owner": str(request.user.id)}, + "ContentType": serializer.validated_data["content_type"], + } if serializer.validated_data["is_unsafe"]: extra_args["Metadata"]["is_unsafe"] = "true" diff --git a/src/backend/core/tests/documents/test_api_documents_attachment_upload.py b/src/backend/core/tests/documents/test_api_documents_attachment_upload.py index 1288f8ca6..4a6564d6a 100644 --- a/src/backend/core/tests/documents/test_api_documents_attachment_upload.py +++ b/src/backend/core/tests/documents/test_api_documents_attachment_upload.py @@ -64,12 +64,22 @@ def test_api_documents_attachment_upload_anonymous_success(): assert response.status_code == 201 pattern = re.compile(rf"^/media/{document.id!s}/attachments/(.*)\.png") - match = pattern.search(response.json()["file"]) + file_path = response.json()["file"] + match = pattern.search(file_path) file_id = match.group(1) # Validate that file_id is a valid UUID uuid.UUID(file_id) + # Now, check the metadata of the uploaded file + key = file_path.replace("/media", "") + file_head = default_storage.connection.meta.client.head_object( + Bucket=default_storage.bucket_name, Key=key + ) + + assert file_head["Metadata"] == {"owner": "None"} + assert file_head["ContentType"] == "image/png" + @pytest.mark.parametrize( "reach, role", @@ -206,6 +216,7 @@ def test_api_documents_attachment_upload_success(via, role, mock_user_teams): Bucket=default_storage.bucket_name, Key=key ) assert file_head["Metadata"] == {"owner": str(user.id)} + assert file_head["ContentType"] == "image/png" def test_api_documents_attachment_upload_invalid(client): @@ -247,16 +258,18 @@ def test_api_documents_attachment_upload_size_limit_exceeded(settings): @pytest.mark.parametrize( - "name,content,extension", + "name,content,extension,content_type", [ - ("test.exe", b"text", "exe"), - ("test", b"text", "txt"), - ("test.aaaaaa", b"test", "txt"), - ("test.txt", PIXEL, "txt"), - ("test.py", b"#!/usr/bin/python", "py"), + ("test.exe", b"text", "exe", "text/plain"), + ("test", b"text", "txt", "text/plain"), + ("test.aaaaaa", b"test", "txt", "text/plain"), + ("test.txt", PIXEL, "txt", "image/png"), + ("test.py", b"#!/usr/bin/python", "py", "text/plain"), ], ) -def test_api_documents_attachment_upload_fix_extension(name, content, extension): +def test_api_documents_attachment_upload_fix_extension( + name, content, extension, content_type +): """ A file with no extension or a wrong extension is accepted and the extension is corrected in storage. @@ -287,6 +300,7 @@ def test_api_documents_attachment_upload_fix_extension(name, content, extension) Bucket=default_storage.bucket_name, Key=key ) assert file_head["Metadata"] == {"owner": str(user.id), "is_unsafe": "true"} + assert file_head["ContentType"] == content_type def test_api_documents_attachment_upload_empty_file(): @@ -335,3 +349,4 @@ def test_api_documents_attachment_upload_unsafe(): Bucket=default_storage.bucket_name, Key=key ) assert file_head["Metadata"] == {"owner": str(user.id), "is_unsafe": "true"} + assert file_head["ContentType"] == "application/octet-stream" From 3e419f6a8dcc4321cef3d1c1502df226500d4287 Mon Sep 17 00:00:00 2001 From: Anthony LC Date: Mon, 20 Jan 2025 17:20:51 +0100 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=9A=91=EF=B8=8F(backend)=20command=20?= =?UTF-8?q?to=20update=20attachment=20content-type?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The uploaded files in the system are missing the content-type. We add a command to update the content-type of the existing uploaded files. This command will run one time when we will deploy to the environments. --- src/backend/core/management/__init__.py | 0 .../core/management/commands/__init__.py | 0 .../update_files_content_type_metadata.py | 95 +++++++++++++++++++ ...test_update_files_content_type_metadata.py | 50 ++++++++++ 4 files changed, 145 insertions(+) create mode 100644 src/backend/core/management/__init__.py create mode 100644 src/backend/core/management/commands/__init__.py create mode 100644 src/backend/core/management/commands/update_files_content_type_metadata.py create mode 100644 src/backend/core/tests/commands/test_update_files_content_type_metadata.py diff --git a/src/backend/core/management/__init__.py b/src/backend/core/management/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/backend/core/management/commands/__init__.py b/src/backend/core/management/commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/backend/core/management/commands/update_files_content_type_metadata.py b/src/backend/core/management/commands/update_files_content_type_metadata.py new file mode 100644 index 000000000..bb2e5253a --- /dev/null +++ b/src/backend/core/management/commands/update_files_content_type_metadata.py @@ -0,0 +1,95 @@ +"""Management command updating the metadata for all the files in the MinIO bucket.""" + +from django.core.files.storage import default_storage +from django.core.management.base import BaseCommand + +import magic + +from core.models import Document + +# pylint: disable=too-many-locals, broad-exception-caught + + +class Command(BaseCommand): + """Update the metadata for all the files in the MinIO bucket.""" + + help = __doc__ + + def handle(self, *args, **options): + """Execute management command.""" + s3_client = default_storage.connection.meta.client + bucket_name = default_storage.bucket_name + + mime_detector = magic.Magic(mime=True) + + documents = Document.objects.all() + self.stdout.write( + f"[INFO] Found {documents.count()} documents. Starting ContentType fix..." + ) + + for doc in documents: + doc_id_str = str(doc.id) + prefix = f"{doc_id_str}/attachments/" + self.stdout.write( + f"[INFO] Processing attachments under prefix '{prefix}' ..." + ) + + continuation_token = None + total_updated = 0 + + while True: + list_kwargs = {"Bucket": bucket_name, "Prefix": prefix} + if continuation_token: + list_kwargs["ContinuationToken"] = continuation_token + + response = s3_client.list_objects_v2(**list_kwargs) + + # If no objects found under this prefix, break out of the loop + if "Contents" not in response: + break + + for obj in response["Contents"]: + key = obj["Key"] + + # Skip if it's a folder + if key.endswith("/"): + continue + + try: + # Get existing metadata + head_resp = s3_client.head_object(Bucket=bucket_name, Key=key) + + # Read first ~1KB for MIME detection + partial_obj = s3_client.get_object( + Bucket=bucket_name, Key=key, Range="bytes=0-1023" + ) + partial_data = partial_obj["Body"].read() + + # Detect MIME type + magic_mime_type = mime_detector.from_buffer(partial_data) + + # Update ContentType + s3_client.copy_object( + Bucket=bucket_name, + CopySource={"Bucket": bucket_name, "Key": key}, + Key=key, + ContentType=magic_mime_type, + Metadata=head_resp.get("Metadata", {}), + MetadataDirective="REPLACE", + ) + total_updated += 1 + + except Exception as exc: # noqa + self.stderr.write( + f"[ERROR] Could not update ContentType for {key}: {exc}" + ) + + if response.get("IsTruncated"): + continuation_token = response.get("NextContinuationToken") + else: + break + + if total_updated > 0: + self.stdout.write( + f"[INFO] -> Updated {total_updated} objects for Document {doc_id_str}." + ) diff --git a/src/backend/core/tests/commands/test_update_files_content_type_metadata.py b/src/backend/core/tests/commands/test_update_files_content_type_metadata.py new file mode 100644 index 000000000..4ece36140 --- /dev/null +++ b/src/backend/core/tests/commands/test_update_files_content_type_metadata.py @@ -0,0 +1,50 @@ +""" +Unit test for `update_files_content_type_metadata` command. +""" + +import uuid + +from django.core.files.storage import default_storage +from django.core.management import call_command + +import pytest + +from core import factories + + +@pytest.mark.django_db +def test_update_files_content_type_metadata(): + """ + Test that the command `update_files_content_type_metadata` + fixes the ContentType of attachment in the storage. + """ + s3_client = default_storage.connection.meta.client + bucket_name = default_storage.bucket_name + + # Create files with a wrong ContentType + keys = [] + for _ in range(10): + doc_id = uuid.uuid4() + factories.DocumentFactory(id=doc_id) + key = f"{doc_id}/attachments/testfile.png" + keys.append(key) + fake_png = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR..." + s3_client.put_object( + Bucket=bucket_name, + Key=key, + Body=fake_png, + ContentType="text/plain", + Metadata={"owner": "None"}, + ) + + # Call the command that fixes the ContentType + call_command("update_files_content_type_metadata") + + for key in keys: + head_resp = s3_client.head_object(Bucket=bucket_name, Key=key) + assert ( + head_resp["ContentType"] == "image/png" + ), f"ContentType not fixed, got {head_resp['ContentType']!r}" + + # Check that original metadata was preserved + assert head_resp["Metadata"].get("owner") == "None" From 0bfad91e5b98b74a368c1fa6791b0471644fcebe Mon Sep 17 00:00:00 2001 From: Manuel Raynaud Date: Fri, 24 Jan 2025 20:28:27 +0100 Subject: [PATCH 3/3] =?UTF-8?q?=E2=9C=A8(helm)=20add=20a=20job=20allowing?= =?UTF-8?q?=20to=20run=20arbitrary=20management=20command?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For a specific deployment we may need to run a specific management command, like the one added previously updating all files content-type. A template is added responsible to manage this case. The job will be created only if the backend.job.command is set. --- src/helm/impress/Chart.yaml | 2 +- src/helm/impress/README.md | 5 + src/helm/impress/templates/backend_job.yml | 124 ++++++++++++++++++ ...kend_job.yaml => backend_job_migrate.yaml} | 0 src/helm/impress/values.yaml | 13 ++ 5 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 src/helm/impress/templates/backend_job.yml rename src/helm/impress/templates/{backend_job.yaml => backend_job_migrate.yaml} (100%) diff --git a/src/helm/impress/Chart.yaml b/src/helm/impress/Chart.yaml index 749855d03..80115ecbe 100644 --- a/src/helm/impress/Chart.yaml +++ b/src/helm/impress/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 type: application name: docs -version: 2.0.1-beta.7 +version: 2.0.1-beta.8 appVersion: latest diff --git a/src/helm/impress/README.md b/src/helm/impress/README.md index 6b5587ad5..999dc4799 100644 --- a/src/helm/impress/README.md +++ b/src/helm/impress/README.md @@ -104,6 +104,11 @@ | `backend.migrate.restartPolicy` | backend migrate job restart policy | `Never` | | `backend.createsuperuser.command` | backend migrate command | `["/bin/sh","-c","python manage.py createsuperuser --email $DJANGO_SUPERUSER_EMAIL --password $DJANGO_SUPERUSER_PASSWORD\n"]` | | `backend.createsuperuser.restartPolicy` | backend migrate job restart policy | `Never` | +| `backend.job` | job dedicated to run a random management command, for example after a deployment | | +| `backend.job.name` | The name to use to describe this job | `""` | +| `backend.job.command` | The management command to execute | `[]` | +| `backend.job.restartPolicy` | The restart policy for the job. | `Never` | +| `backend.job.annotations` | Annotations to add to the job [default: argocd.argoproj.io/hook: PostSync] | | | `backend.probes.liveness.path` | Configure path for backend HTTP liveness probe | `/__heartbeat__` | | `backend.probes.liveness.targetPort` | Configure port for backend HTTP liveness probe | `undefined` | | `backend.probes.liveness.initialDelaySeconds` | Configure initial delay for backend liveness probe | `10` | diff --git a/src/helm/impress/templates/backend_job.yml b/src/helm/impress/templates/backend_job.yml new file mode 100644 index 000000000..4888a904a --- /dev/null +++ b/src/helm/impress/templates/backend_job.yml @@ -0,0 +1,124 @@ +{{- if .Values.backend.job.command -}} +{{- $envVars := include "impress.common.env" (list . .Values.backend) -}} +{{- $fullName := include "impress.backend.fullname" . -}} +{{- $component := "backend" -}} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ $fullName }}-{{ .Values.backend.job.name | default "random" | replace "_" "-" }} + namespace: {{ .Release.Namespace | quote }} + annotations: + argocd.argoproj.io/sync-options: Replace=true,Force=true + {{- with .Values.backend.job.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} + labels: + {{- include "impress.common.labels" (list . $component) | nindent 4 }} +spec: + template: + metadata: + annotations: + {{- with .Values.backend.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "impress.common.selectorLabels" (list . $component) | nindent 8 }} + spec: + {{- if $.Values.image.credentials }} + imagePullSecrets: + - name: {{ include "impress.secret.dockerconfigjson.name" (dict "fullname" (include "impress.fullname" .) "imageCredentials" $.Values.image.credentials) }} + {{- end}} + shareProcessNamespace: {{ .Values.backend.shareProcessNamespace }} + containers: + {{- with .Values.backend.sidecars }} + {{- toYaml . | nindent 8 }} + {{- end }} + - name: {{ .Chart.Name }} + image: "{{ (.Values.backend.image | default dict).repository | default .Values.image.repository }}:{{ (.Values.backend.image | default dict).tag | default .Values.image.tag }}" + imagePullPolicy: {{ (.Values.backend.image | default dict).pullPolicy | default .Values.image.pullPolicy }} + {{- with .Values.backend.job.command }} + command: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.backend.args }} + args: + {{- toYaml . | nindent 12 }} + {{- end }} + env: + {{- if $envVars}} + {{- $envVars | indent 12 }} + {{- end }} + {{- with .Values.backend.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.backend.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumeMounts: + {{- range $index, $value := .Values.mountFiles }} + - name: "files-{{ $index }}" + mountPath: {{ $value.path }} + subPath: content + {{- end }} + {{- range $name, $volume := .Values.backend.persistence }} + - name: "{{ $name }}" + mountPath: "{{ $volume.mountPath }}" + {{- end }} + {{- range .Values.backend.extraVolumeMounts }} + - name: {{ .name }} + mountPath: {{ .mountPath }} + subPath: {{ .subPath | default "" }} + readOnly: {{ .readOnly }} + {{- end }} + {{- with .Values.backend.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.backend.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.backend.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + restartPolicy: {{ .Values.backend.job.restartPolicy }} + volumes: + {{- range $index, $value := .Values.mountFiles }} + - name: "files-{{ $index }}" + configMap: + name: "{{ include "impress.fullname" $ }}-files-{{ $index }}" + {{- end }} + {{- range $name, $volume := .Values.backend.persistence }} + - name: "{{ $name }}" + {{- if eq $volume.type "emptyDir" }} + emptyDir: {} + {{- else }} + persistentVolumeClaim: + claimName: "{{ $fullName }}-{{ $name }}" + {{- end }} + {{- end }} + {{- range .Values.backend.extraVolumes }} + - name: {{ .name }} + {{- if .existingClaim }} + persistentVolumeClaim: + claimName: {{ .existingClaim }} + {{- else if .hostPath }} + hostPath: + {{ toYaml .hostPath | nindent 12 }} + {{- else if .csi }} + csi: + {{- toYaml .csi | nindent 12 }} + {{- else if .configMap }} + configMap: + {{- toYaml .configMap | nindent 12 }} + {{- else if .emptyDir }} + emptyDir: + {{- toYaml .emptyDir | nindent 12 }} + {{- else }} + emptyDir: {} + {{- end }} + {{- end }} +{{- end }} diff --git a/src/helm/impress/templates/backend_job.yaml b/src/helm/impress/templates/backend_job_migrate.yaml similarity index 100% rename from src/helm/impress/templates/backend_job.yaml rename to src/helm/impress/templates/backend_job_migrate.yaml diff --git a/src/helm/impress/values.yaml b/src/helm/impress/values.yaml index 9f6924bba..d429937f7 100644 --- a/src/helm/impress/values.yaml +++ b/src/helm/impress/values.yaml @@ -251,6 +251,19 @@ backend: python manage.py createsuperuser --email $DJANGO_SUPERUSER_EMAIL --password $DJANGO_SUPERUSER_PASSWORD restartPolicy: Never + ## @extra backend.job job dedicated to run a random management command, for example after a deployment + ## @param backend.job.name The name to use to describe this job + ## @param backend.job.command The management command to execute + ## @param backend.job.restartPolicy The restart policy for the job. + ## @extra backend.job.annotations Annotations to add to the job [default: argocd.argoproj.io/hook: PostSync] + ## @skip backend.job.annotations.argocd.argoproj.io/hook + job: + name: "" + command: [] + restartPolicy: Never + annotations: + argocd.argoproj.io/hook: PostSync + ## @param backend.probes.liveness.path [nullable] Configure path for backend HTTP liveness probe ## @param backend.probes.liveness.targetPort [nullable] Configure port for backend HTTP liveness probe ## @param backend.probes.liveness.initialDelaySeconds [nullable] Configure initial delay for backend liveness probe