diff --git a/peachjam/admin.py b/peachjam/admin.py
index 9c244606f..fb43f2082 100644
--- a/peachjam/admin.py
+++ b/peachjam/admin.py
@@ -325,6 +325,7 @@ class DocumentAdmin(admin.ModelAdmin):
         "reextract_content",
         "reindex_for_search",
         "apply_labels",
+        "ensure_source_file_pdf",
     ]
 
     fieldsets = [
@@ -516,6 +517,18 @@ def apply_labels(self, request, queryset):
 
     apply_labels.short_description = "Apply labels"
 
+    def ensure_source_file_pdf(self, request, queryset):
+        """Queue PDF conversion of each selected document's source file."""
+        count = 0
+        for doc in queryset:
+            # documents without a source file have nothing to convert
+            if hasattr(doc, "source_file"):
+                doc.source_file.ensure_file_as_pdf()
+                count += 1
+        self.message_user(request, f"Ensuring PDF for {count} documents.")
+
+    ensure_source_file_pdf.short_description = "Ensure PDF for source file"
+
     def has_delete_permission(self, request, obj=None):
         if obj:
             if (
diff --git a/peachjam/forms.py b/peachjam/forms.py
index 8bc0e496f..e4dfccc32 100644
--- a/peachjam/forms.py
+++ b/peachjam/forms.py
@@ -174,6 +174,7 @@ class SourceFileForm(AttachmentFormMixin, forms.ModelForm):
     class Meta:
         model = SourceFile
         fields = "__all__"
+        exclude = ("file_as_pdf",)
 
     def _save_m2m(self):
         super()._save_m2m()
@@ -181,6 +182,10 @@ def _save_m2m(self):
             if self.instance.document.extract_content_from_source_file():
                 self.instance.document.save()
 
+            # if the file is changed, we need to delete the existing pdf and re-generate
+            self.instance.file_as_pdf.delete()
+            self.instance.ensure_file_as_pdf()
+
 
 class AttachedFilesForm(AttachmentFormMixin, forms.ModelForm):
     class Meta:
diff --git a/peachjam/migrations/0096_sourcefile_file_as_pdf.py b/peachjam/migrations/0096_sourcefile_file_as_pdf.py
new file mode 100644
index 000000000..e77ecd1aa
--- /dev/null
+++ b/peachjam/migrations/0096_sourcefile_file_as_pdf.py
@@ -0,0 +1,26 @@
+# Generated by Django 3.2.19 on 2023-08-15 04:20
+
+from django.db import migrations, models
+
+import peachjam.models.core_document_model
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("peachjam", "0095_api_perms"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="sourcefile",
+            name="file_as_pdf",
+            field=models.FileField(
+                blank=True,
+                max_length=1024,
+                null=True,
+                upload_to=peachjam.models.core_document_model.file_location,
+                verbose_name="file as pdf",
+            ),
+        ),
+    ]
diff --git a/peachjam/models/core_document_model.py b/peachjam/models/core_document_model.py
index 77530aefe..9de47c7b9 100644
--- a/peachjam/models/core_document_model.py
+++ b/peachjam/models/core_document_model.py
@@ -756,6 +756,14 @@ class SourceFile(AttachmentAbstractModel):
         _("source URL"), max_length=2048, null=True, blank=True
     )
 
+    file_as_pdf = models.FileField(
+        _("file as pdf"),
+        upload_to=file_location,
+        max_length=1024,
+        null=True,
+        blank=True,
+    )
+
     class Meta:
         verbose_name = _("source file")
         verbose_name_plural = _("source files")
@@ -763,10 +771,34 @@ class Meta:
     def as_pdf(self):
+        """Return the PDF rendition of this file, or None if there isn't one yet."""
         if self.filename.endswith(".pdf"):
             return self.file
+        elif self.mimetype == "application/pdf":
+            # same test as convert_to_pdf, which never converts PDF uploads
+            return self.file
+        elif self.file_as_pdf:
+            return self.file_as_pdf
+        else:
+            return None
+
+    def convert_to_pdf(self):
+        """Convert the file to PDF with soffice and store it in file_as_pdf.
+
+        Does nothing if the file is already a PDF or has already been converted.
+        """
+        if self.mimetype != "application/pdf" and not self.file_as_pdf:
+            suffix = os.path.splitext(self.filename)[1].replace(".", "")
+            pdf = soffice_convert(self.file, suffix, "pdf")[0]
+            # swap the real extension for .pdf; don't assume it is 5 chars long
+            pdf_name = f"{os.path.splitext(self.file.name)[0]}.pdf"
+            self.file_as_pdf = File(pdf, name=pdf_name)
+            self.save()
+
+    def ensure_file_as_pdf(self):
+        """Queue a background conversion to PDF, if one is needed."""
+        from peachjam.tasks import convert_source_file_to_pdf
 
-        # convert with soffice
-        suffix = os.path.splitext(self.filename)[1].replace(".", "")
-        return soffice_convert(self.file, suffix, "pdf")[0]
+        if self.mimetype != "application/pdf" and not self.file_as_pdf:
+            convert_source_file_to_pdf(self.id)
 
     def filename_extension(self):
         return os.path.splitext(self.filename)[1][1:]
diff --git a/peachjam/signals.py b/peachjam/signals.py
index 49e9e6239..79472dc9b 100644
--- a/peachjam/signals.py
+++ b/peachjam/signals.py
@@ -8,7 +8,7 @@
 from django.db.models import signals
 from django.dispatch import receiver
 
-from peachjam.models import CoreDocument, Work
+from peachjam.models import CoreDocument, SourceFile, Work
 from peachjam.tasks import update_extracted_citations_for_a_work
 
 
@@ -75,3 +75,10 @@ def doc_deleted_update_extracted_citations(sender, instance, **kwargs):
     """Update language list on related work after a subclass of CoreDocument is deleted."""
     if isinstance(instance, CoreDocument):
         update_extracted_citations_for_a_work(instance.work_id)
+
+
+@receiver(signals.post_save, sender=SourceFile)
+def convert_to_pdf(sender, instance, created, **kwargs):
+    """Ensure a newly created source file has a PDF version."""
+    if created:
+        instance.ensure_file_as_pdf()
diff --git a/peachjam/tasks.py b/peachjam/tasks.py
index d2e1570dc..8ead2342c 100644
--- a/peachjam/tasks.py
+++ b/peachjam/tasks.py
@@ -158,3 +158,26 @@ def update_extracted_citations_for_a_work(work_id):
 def re_extract_citations():
     cp = citations_processor()
     cp.re_extract_citations()
+
+
+@background(queue="peachjam", remove_existing_tasks=True)
+def convert_source_file_to_pdf(source_file_id):
+    """Convert the source file with the given id to PDF, if it still exists."""
+    from peachjam.models import SourceFile
+
+    source_file = SourceFile.objects.filter(pk=source_file_id).first()
+    if not source_file:
+        log.info(f"No source file with id {source_file_id} exists, ignoring.")
+        return
+
+    log.info(f"Converting source file {source_file_id} to PDF")
+
+    try:
+        source_file.convert_to_pdf()
+
+    except Exception as e:
+        log.error(f"Error converting source file {source_file_id} to PDF", exc_info=e)
+        # bare raise re-raises the active exception unchanged
+        raise
+
+    log.info("Conversion to PDF done")