diff --git a/peachjam/adapters/indigo.py b/peachjam/adapters/indigo.py
index 062edf9f0..6a6ffc956 100644
--- a/peachjam/adapters/indigo.py
+++ b/peachjam/adapters/indigo.py
@@ -339,9 +339,16 @@ def download_and_save_document_images(self, document, created_document):
         Image.objects.filter(document=created_document).delete()
 
         image_list = self.list_images_from_content_api(document)
+        # we ignore duplicate filenames
+        filenames = set()
         if image_list:
             for result in image_list:
-                if result["mime_type"].startswith("image/"):
+                filename = result["filename"]
+                if (
+                    result["mime_type"].startswith("image/")
+                    and filename not in filenames
+                ):
+                    filenames.add(filename)
                     with NamedTemporaryFile() as file:
                         r = self.client_get(result["url"])
                         file.write(r.content)
diff --git a/peachjam/migrations/0161_make_image_filenames_unique.py b/peachjam/migrations/0161_make_image_filenames_unique.py
new file mode 100644
index 000000000..39a8bb2ad
--- /dev/null
+++ b/peachjam/migrations/0161_make_image_filenames_unique.py
@@ -0,0 +1,46 @@
+# Generated by Django 4.2.14 on 2024-09-26 14:17
+
+from django.db import migrations
+from django.db.models import Count
+
+
+def remove_duplicate_images(apps, schema_editor):
+    """Delete all but one Image per (document, filename) pair.
+
+    Runs before the unique_together constraint is added, which would
+    otherwise fail on existing duplicate rows.
+    """
+    Image = apps.get_model("peachjam", "Image")
+
+    # Find duplicates by grouping by document and filename
+    duplicates = (
+        Image.objects.values("document", "filename")
+        .annotate(count=Count("id"))
+        .filter(count__gt=1)
+    )
+
+    # Loop through each duplicate group
+    for dup in duplicates:
+        # Get all images with the same document and filename
+        images = Image.objects.filter(
+            document=dup["document"], filename=dup["filename"]
+        )
+
+        # Keep one and delete the rest
+        images.exclude(id=images.first().id).delete()
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("peachjam", "0160_ingestor_repeat_ingestor_schedule"),
+    ]
+
+    operations = [
+        # deleted duplicates cannot be restored, so reversing is a no-op
+        migrations.RunPython(remove_duplicate_images, migrations.RunPython.noop),
+        migrations.AlterUniqueTogether(
+            name="image",
+            unique_together={("document", "filename")},
+        ),
+    ]
diff --git a/peachjam/models/core_document_model.py b/peachjam/models/core_document_model.py
index b04b1b673..466f76be2 100644
--- a/peachjam/models/core_document_model.py
+++ b/peachjam/models/core_document_model.py
@@ -829,6 +829,7 @@ class Image(AttachmentAbstractModel):
     class Meta:
         verbose_name = _("image")
         verbose_name_plural = _("images")
+        unique_together = ("document", "filename")
 
     @classmethod
     def from_docpipe_attachment(cls, attachment):