diff --git a/src/backend/core/migrations/00013_fix_content_types.py b/src/backend/core/migrations/00013_fix_content_types.py
new file mode 100644
index 000000000..0700c255f
--- /dev/null
+++ b/src/backend/core/migrations/00013_fix_content_types.py
@@ -0,0 +1,125 @@
+"""Fix the ContentType of document attachments already in object storage."""
+
+import magic
+from django.db import migrations
+from django.core.files.storage import default_storage
+
+# python-magic recommends passing at least the first 2048 bytes to libmagic;
+# a shorter buffer can lead to an incorrect identification.
+MIME_DETECTION_BYTES = 2048
+
+# System-defined headers that copy_object resets when MetadataDirective is
+# "REPLACE", unless they are sent again with the copy request.
+PRESERVED_HEADERS = (
+    "CacheControl",
+    "ContentDisposition",
+    "ContentEncoding",
+    "ContentLanguage",
+)
+
+
+def _fix_object_content_type(s3_client, bucket_name, key, mime_detector):
+    """
+    Detect the MIME type of one stored object and rewrite its ContentType.
+
+    Returns True when the object was rewritten, False when its ContentType
+    already matched the detected type. Storage and detection errors are not
+    handled here and propagate to the caller.
+    """
+    head_resp = s3_client.head_object(Bucket=bucket_name, Key=key)
+
+    # Only the beginning of the object is needed for MIME detection
+    partial_obj = s3_client.get_object(
+        Bucket=bucket_name,
+        Key=key,
+        Range=f"bytes=0-{MIME_DETECTION_BYTES - 1}",
+    )
+    magic_mime_type = mime_detector.from_buffer(partial_obj["Body"].read())
+
+    # Copying an object onto itself rewrites it: leave correct objects alone
+    if head_resp.get("ContentType") == magic_mime_type:
+        return False
+
+    preserved_headers = {
+        header: head_resp[header]
+        for header in PRESERVED_HEADERS
+        if head_resp.get(header)
+    }
+
+    s3_client.copy_object(
+        Bucket=bucket_name,
+        CopySource={"Bucket": bucket_name, "Key": key},
+        Key=key,
+        ContentType=magic_mime_type,
+        Metadata=head_resp.get("Metadata", {}),
+        MetadataDirective="REPLACE",
+        **preserved_headers,
+    )
+    return True
+
+
+def fix_attachments_content_type(apps, schema_editor):
+    """
+    Iterate over all Document objects and fix ContentType for files in
+    the "{document_id}/attachments/" prefix on default_storage (MinIO).
+
+    The fix is best effort: a failure on one object is reported and the
+    migration carries on with the next one.
+    """
+
+    Document = apps.get_model('core', 'Document')
+
+    s3_client = default_storage.connection.meta.client
+    bucket_name = default_storage.bucket_name
+
+    mime_detector = magic.Magic(mime=True)
+    paginator = s3_client.get_paginator("list_objects_v2")
+
+    # Only the ids are needed to build the storage prefixes
+    document_ids = Document.objects.values_list("id", flat=True)
+    print(f"[INFO] Found {document_ids.count()} documents. Starting ContentType fix...")
+
+    for doc_id in document_ids.iterator():
+        prefix = f"{doc_id}/attachments/"
+        print(f"[INFO] Processing attachments under prefix '{prefix}' ...")
+
+        total_updated = 0
+
+        # The paginator follows continuation tokens for us
+        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
+            # A page has no "Contents" key when nothing matches the prefix
+            for obj in page.get("Contents", []):
+                key = obj["Key"]
+
+                # Skip folder placeholders, and empty objects: they hold no
+                # bytes to sniff and a ranged read on them cannot be satisfied
+                if key.endswith("/") or obj.get("Size") == 0:
+                    continue
+
+                try:
+                    if _fix_object_content_type(
+                        s3_client, bucket_name, key, mime_detector
+                    ):
+                        total_updated += 1
+                except Exception as exc:
+                    # Deliberately broad: one unreadable object must not abort
+                    # the whole migration
+                    print(f"[ERROR] Could not update ContentType for {key}: {exc}")
+
+        if total_updated:
+            print(f"[INFO] -> Updated {total_updated} objects for Document {doc_id}.")
+
+
+class Migration(migrations.Migration):
+    """Run the attachment ContentType fix; reversing it is a no-op."""
+
+    dependencies = [
+        ('core', '0012_make_document_creator_and_invitation_issuer_optional'),
+    ]
+
+    operations = [
+        migrations.RunPython(
+            fix_attachments_content_type,
+            reverse_code=migrations.RunPython.noop
+        ),
+    ]
\ No newline at end of file
diff --git a/src/backend/core/tests/migrations/test_00013_fix_content_types.py b/src/backend/core/tests/migrations/test_00013_fix_content_types.py
new file mode 100644
index 000000000..12cd297f3
--- /dev/null
+++ b/src/backend/core/tests/migrations/test_00013_fix_content_types.py
@@ -0,0 +1,44 @@
+import uuid
+import pytest
+from core import factories
+from django.core.files.storage import default_storage
+
+@pytest.mark.django_db
+def test_fix_attachments_content_type_migration(migrator):
+    """
+    Test that the migration fixes the ContentType of attachments in our storage.
+    """
+    migrator.apply_initial_migration(('core', '0012_make_document_creator_and_invitation_issuer_optional'))
+
+    doc_id = uuid.uuid4()
+    factories.DocumentFactory(id=doc_id)
+
+    # Put a file with a *wrong* ContentType
+    s3_client = default_storage.connection.meta.client
+    bucket_name = default_storage.bucket_name
+    key = f"{doc_id}/attachments/testfile.png"
+    fake_png = (
+        b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR..."
+    )
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=key,
+        Body=fake_png,
+        ContentType="text/plain",
+        ContentDisposition="attachment",
+        Metadata={"owner": "None"},
+    )
+
+    # Apply the migration that fixes the ContentType
+    migrator.apply_tested_migration(('core', '00013_fix_content_types'))
+
+    head_resp = s3_client.head_object(Bucket=bucket_name, Key=key)
+    assert head_resp["ContentType"] == "image/png", (
+        f"ContentType not fixed, got {head_resp['ContentType']!r}"
+    )
+
+    # Check that original metadata was preserved
+    assert head_resp["Metadata"].get("owner") == "None"
+
+    # Check that system headers survived the metadata replacement
+    assert head_resp.get("ContentDisposition") == "attachment"
\ No newline at end of file