Skip to content

Commit

Permalink
Merge pull request #2065 from laws-africa/unique-images
Browse files Browse the repository at this point in the history
enforce unique image filenames on documents
  • Loading branch information
longhotsummer authored Sep 27, 2024
2 parents 1038d64 + 4de39a2 commit 98e14ae
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 1 deletion.
9 changes: 8 additions & 1 deletion peachjam/adapters/indigo.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,9 +350,16 @@ def download_and_save_document_images(self, document, created_document):
Image.objects.filter(document=created_document).delete()

image_list = self.list_images_from_content_api(document)
# we ignore duplicate filenames
filenames = set()
if image_list:
for result in image_list:
if result["mime_type"].startswith("image/"):
filename = result["filename"]
if (
result["mime_type"].startswith("image/")
and filename not in filenames
):
filenames.add(filename)
with NamedTemporaryFile() as file:
r = self.client_get(result["url"])
file.write(r.content)
Expand Down
40 changes: 40 additions & 0 deletions peachjam/migrations/0161_make_image_filenames_unique.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Generated by Django 4.2.14 on 2024-09-26 14:17

from django.db import migrations
from django.db.models import Count


def remove_duplicate_images(apps, schema_editor):
    """Delete surplus images so each (document, filename) pair occurs once.

    Data-migration step: for every document/filename combination that has
    more than one Image row, one row is kept and the others are deleted.
    The surviving row is whichever one ``first()`` returns (the lowest
    primary key unless the model defines its own ordering).

    Args:
        apps: historical app registry supplied by the migration framework.
        schema_editor: supplied by the migration framework; unused here.
    """
    image_model = apps.get_model("peachjam", "Image")

    # Group by document and filename; only groups with more than one row
    # are duplicates that need cleaning up.
    grouped = image_model.objects.values("document", "filename")
    repeated_groups = grouped.annotate(count=Count("id")).filter(count__gt=1)

    for group in repeated_groups:
        # Every image sharing this document and filename.
        matching = image_model.objects.filter(
            document=group["document"],
            filename=group["filename"],
        )

        # Retain a single row and drop all the others.
        survivor = matching.first()
        matching.exclude(id=survivor.id).delete()


class Migration(migrations.Migration):
    """Remove duplicate images, then enforce unique (document, filename)."""

    dependencies = [
        ("peachjam", "0160_ingestor_repeat_ingestor_schedule"),
    ]

    operations = [
        # Duplicates are removed first so that the unique constraint added
        # below can be applied to existing data.
        #
        # Deleted rows cannot be restored, so the reverse step is a no-op;
        # supplying reverse_code keeps the migration reversible instead of
        # raising IrreversibleError when it is unapplied.
        migrations.RunPython(
            remove_duplicate_images,
            reverse_code=migrations.RunPython.noop,
        ),
        migrations.AlterUniqueTogether(
            name="image",
            unique_together={("document", "filename")},
        ),
    ]
1 change: 1 addition & 0 deletions peachjam/models/core_document_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,7 @@ class Image(AttachmentAbstractModel):
class Meta:
verbose_name = _("image")
verbose_name_plural = _("images")
unique_together = ("document", "filename")

@classmethod
def from_docpipe_attachment(cls, attachment):
Expand Down

0 comments on commit 98e14ae

Please sign in to comment.