Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enforce unique image filenames on documents #2065

Merged
merged 1 commit into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion peachjam/adapters/indigo.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,9 +339,16 @@ def download_and_save_document_images(self, document, created_document):
Image.objects.filter(document=created_document).delete()

image_list = self.list_images_from_content_api(document)
# we ignore duplicate filenames
filenames = set()
if image_list:
for result in image_list:
if result["mime_type"].startswith("image/"):
filename = result["filename"]
if (
result["mime_type"].startswith("image/")
and filename not in filenames
):
filenames.add(filename)
with NamedTemporaryFile() as file:
r = self.client_get(result["url"])
file.write(r.content)
Expand Down
40 changes: 40 additions & 0 deletions peachjam/migrations/0161_make_image_filenames_unique.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Generated by Django 4.2.14 on 2024-09-26 14:17

from django.db import migrations
from django.db.models import Count


def remove_duplicate_images(apps, schema_editor):
    """Delete surplus Image rows that share a document and filename.

    For every (document, filename) pair that occurs more than once, the row
    returned by ``first()`` is kept and all other rows in that group are
    deleted.
    """
    Image = apps.get_model("peachjam", "Image")

    # Group by document and filename; keep only groups with more than one row.
    duplicate_groups = (
        Image.objects.values("document", "filename")
        .annotate(count=Count("id"))
        .filter(count__gt=1)
    )

    for group in duplicate_groups:
        # All rows belonging to this duplicated (document, filename) pair.
        same_name = Image.objects.filter(
            document=group["document"], filename=group["filename"]
        )

        # Retain a single row and drop every other one in the group.
        keeper = same_name.first()
        same_name.exclude(id=keeper.id).delete()


class Migration(migrations.Migration):
    """Remove duplicate images, then enforce unique (document, filename)."""

    dependencies = [
        ("peachjam", "0160_ingestor_repeat_ingestor_schedule"),
    ]

    operations = [
        # Runs first so existing duplicate rows do not violate the new
        # constraint. Deleted rows cannot be restored, so the reverse is a
        # no-op; this keeps the migration reversible instead of raising
        # IrreversibleError when unapplied.
        migrations.RunPython(
            remove_duplicate_images,
            reverse_code=migrations.RunPython.noop,
        ),
        migrations.AlterUniqueTogether(
            name="image",
            unique_together={("document", "filename")},
        ),
    ]
1 change: 1 addition & 0 deletions peachjam/models/core_document_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,7 @@ class Image(AttachmentAbstractModel):
class Meta:
verbose_name = _("image")
verbose_name_plural = _("images")
unique_together = ("document", "filename")

@classmethod
def from_docpipe_attachment(cls, attachment):
Expand Down
Loading