Skip to content

Commit

Permalink
converts source files to pdf proactively
Browse files Browse the repository at this point in the history
  • Loading branch information
actlikewill committed Aug 15, 2023
1 parent a6ca940 commit d242b90
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 4 deletions.
10 changes: 10 additions & 0 deletions peachjam/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ class DocumentAdmin(admin.ModelAdmin):
"reextract_content",
"reindex_for_search",
"apply_labels",
"ensure_source_file_pdf",
]

fieldsets = [
Expand Down Expand Up @@ -516,6 +517,15 @@ def apply_labels(self, request, queryset):

apply_labels.short_description = "Apply labels"

def ensure_source_file_pdf(self, request, queryset):
    """Admin action: ask each selected document's source file to ensure it has a PDF.

    Documents that have no ``source_file`` attribute are skipped. The
    confirmation message reports the size of the whole selection, including
    any skipped documents.
    """
    total = queryset.count()
    for document in queryset:
        if not hasattr(document, "source_file"):
            continue
        document.source_file.ensure_file_as_pdf()
    message = f"Ensuring PDF for {total} documents."
    self.message_user(request, message)


ensure_source_file_pdf.short_description = "Ensure PDF for source file"

def has_delete_permission(self, request, obj=None):
if obj:
if (
Expand Down
5 changes: 5 additions & 0 deletions peachjam/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,18 @@ class SourceFileForm(AttachmentFormMixin, forms.ModelForm):
# Form options: expose every SourceFile model field except file_as_pdf.
# file_as_pdf is excluded because it is regenerated from the uploaded file
# (see _save_m2m on this form) rather than edited by hand.
class Meta:
model = SourceFile
fields = "__all__"
exclude = ("file_as_pdf",)

def _save_m2m(self):
    """Run the default m2m save, then react to a replaced source file.

    When the ``file`` field changed, the owning document re-extracts its
    content (and is saved if the extraction reports a change), and the PDF
    rendition is discarded and requested again.
    """
    super()._save_m2m()
    if "file" not in self.changed_data:
        return

    source_file = self.instance
    document = source_file.document
    if document.extract_content_from_source_file():
        document.save()

    # The file has changed, so the existing PDF rendition is stale: delete it
    # and ask for it to be regenerated.
    source_file.file_as_pdf.delete()
    source_file.ensure_file_as_pdf()


class AttachedFilesForm(AttachmentFormMixin, forms.ModelForm):
class Meta:
Expand Down
26 changes: 26 additions & 0 deletions peachjam/migrations/0096_sourcefile_file_as_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Generated by Django 3.2.19 on 2023-08-15 04:20

from django.db import migrations, models

import peachjam.models.core_document_model


class Migration(migrations.Migration):
    """Add the optional ``file_as_pdf`` file field to the ``sourcefile`` model."""

    # Must run after the previous peachjam migration.
    dependencies = [("peachjam", "0095_api_perms")]

    operations = [
        migrations.AddField(
            model_name="sourcefile",
            name="file_as_pdf",
            # Nullable and blank-able, so existing rows need no default value.
            field=models.FileField(
                blank=True,
                max_length=1024,
                null=True,
                upload_to=peachjam.models.core_document_model.file_location,
                verbose_name="file as pdf",
            ),
        ),
    ]
27 changes: 24 additions & 3 deletions peachjam/models/core_document_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,17 +756,38 @@ class SourceFile(AttachmentAbstractModel):
_("source URL"), max_length=2048, null=True, blank=True
)

# PDF rendition of the source file. Optional (null/blank): it is filled in by
# convert_to_pdf() only when the source file's mimetype is not already
# application/pdf. The storage path is chosen by the file_location callable.
file_as_pdf = models.FileField(
_("file as pdf"),
upload_to=file_location,
max_length=1024,
null=True,
blank=True,
)

# Display names for this model, wrapped in _() (presumably a gettext alias,
# so they can be translated -- confirm against the module's imports).
class Meta:
verbose_name = _("source file")
verbose_name_plural = _("source files")

def as_pdf(self):
    """Return a PDF version of this source file, or ``None`` if none exists.

    Returns:
        ``self.file`` when the source file is itself a PDF, otherwise the
        generated ``self.file_as_pdf`` if it has been populated, otherwise
        ``None``.
    """
    # Recognise PDFs by mimetype as well as by (case-insensitive) extension.
    # convert_to_pdf() and ensure_file_as_pdf() skip anything whose mimetype is
    # application/pdf, so a PDF named "X.PDF" or with no extension never gets a
    # file_as_pdf and would otherwise yield None here.
    filename = self.filename or ""
    if self.mimetype == "application/pdf" or filename.lower().endswith(".pdf"):
        return self.file
    if self.file_as_pdf:
        return self.file_as_pdf
    return None

def convert_to_pdf(self):
    """Generate a PDF rendition of ``file`` and store it in ``file_as_pdf``.

    Does nothing when the source file's mimetype is already
    ``application/pdf`` or when a rendition is already stored. Otherwise the
    file is passed to ``soffice_convert`` and the first item of its result is
    attached as ``file_as_pdf``, after which the model is saved.
    """
    if self.mimetype == "application/pdf" or self.file_as_pdf:
        return

    # Source format handed to the converter: the filename extension, no dot.
    suffix = os.path.splitext(self.filename)[1].replace(".", "")
    pdf = soffice_convert(self.file, suffix, "pdf")[0]

    # Strip the actual extension instead of a fixed five characters, which was
    # only correct for ".docx"-length extensions and ate part of the base name
    # for ".doc", ".rtf", ".odt" and similar.
    base_name = os.path.splitext(self.file.name)[0]
    self.file_as_pdf = File(pdf, name=f"{base_name}.pdf")
    self.save()

def ensure_file_as_pdf(self):
    """Request a PDF conversion for this source file if it still needs one.

    Nothing happens when the file's mimetype is already ``application/pdf``
    or a PDF rendition is already stored. Otherwise the
    ``convert_source_file_to_pdf`` task is invoked with this object's id.
    """
    # Imported at call time rather than module level -- presumably to avoid a
    # circular import between the models and tasks modules.
    from peachjam.tasks import convert_source_file_to_pdf

    # The leftover inline soffice conversion (an unconditional return) has been
    # removed: it made the check below unreachable, so no task was ever invoked.
    if self.mimetype != "application/pdf" and not self.file_as_pdf:
        convert_source_file_to_pdf(self.id)

def filename_extension(self):
    """Return the extension of ``filename`` without its leading dot ("" if none)."""
    _root, extension = os.path.splitext(self.filename)
    return extension[1:]
Expand Down
9 changes: 8 additions & 1 deletion peachjam/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from django.db.models import signals
from django.dispatch import receiver

from peachjam.models import CoreDocument, Work
from peachjam.models import CoreDocument, SourceFile, Work
from peachjam.tasks import update_extracted_citations_for_a_work


Expand Down Expand Up @@ -75,3 +75,10 @@ def doc_deleted_update_extracted_citations(sender, instance, **kwargs):
"""Update language list on related work after a subclass of CoreDocument is deleted."""
if isinstance(instance, CoreDocument):
update_extracted_citations_for_a_work(instance.work_id)


@receiver(signals.post_save, sender=SourceFile)
def convert_to_pdf(sender, instance, created, **kwargs):
    """Ensure a newly created source file gets a PDF rendition.

    Only the save that creates the row triggers this; later saves of an
    existing source file are ignored by this handler.
    """
    if not created:
        return
    instance.ensure_file_as_pdf()
21 changes: 21 additions & 0 deletions peachjam/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,24 @@ def update_extracted_citations_for_a_work(work_id):
def re_extract_citations():
cp = citations_processor()
cp.re_extract_citations()


@background(queue="peachjam", remove_existing_tasks=True)
def convert_source_file_to_pdf(source_file_id):
    """Convert the source file with the given primary key to PDF.

    A missing source file is logged and ignored. Any exception raised by the
    conversion is logged with its traceback and then re-raised.
    """
    from peachjam.models import SourceFile

    matches = SourceFile.objects.filter(pk=source_file_id)
    source_file = matches.first()
    if not source_file:
        log.info(f"No source file with id {source_file_id} exists, ignoring.")
        return

    log.info(f"Converting source file {source_file_id} to PDF")

    try:
        source_file.convert_to_pdf()
    except Exception as err:
        log.error(
            f"Error converting source file {source_file_id} to PDF", exc_info=err
        )
        raise err
    else:
        # Reached only when the conversion raised nothing.
        log.info("Conversion to PDF done")

0 comments on commit d242b90

Please sign in to comment.