From e10a20838cd8e2eb81ece6fcb4d76ec73e31d8fe Mon Sep 17 00:00:00 2001 From: Greg Kempe Date: Wed, 26 Jul 2023 11:15:49 +0200 Subject: [PATCH] /source redirects to /source.pdf for PDF source files This ensures that documents that are pure PDF are only served from a single URL (/source.pdf) which is good for caching, google and pocketlaw. --- peachjam/models/core_document_model.py | 4 +-- peachjam/views/documents.py | 42 ++++++++++++++++++-------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/peachjam/models/core_document_model.py b/peachjam/models/core_document_model.py index e329c1e5a..bc9a9e0a0 100644 --- a/peachjam/models/core_document_model.py +++ b/peachjam/models/core_document_model.py @@ -731,9 +731,9 @@ def as_pdf(self): def filename_extension(self): return os.path.splitext(self.filename)[1][1:] - def filename_for_download(self): + def filename_for_download(self, ext=None): """Return a generated filename appropriate for use when downloading this source file.""" - ext = os.path.splitext(self.filename)[1] + ext = ext or os.path.splitext(self.filename)[1] title = re.sub(r"[^a-zA-Z0-9() ]", "", self.document.title) return title + ext diff --git a/peachjam/views/documents.py b/peachjam/views/documents.py index 43fe1ad8f..55c053e1f 100644 --- a/peachjam/views/documents.py +++ b/peachjam/views/documents.py @@ -1,7 +1,7 @@ from cobalt import FrbrUri from django.conf import settings from django.http import Http404, HttpResponse -from django.shortcuts import get_object_or_404, redirect +from django.shortcuts import get_object_or_404, redirect, reverse from django.utils.decorators import method_decorator from django.utils.translation import get_language from django.views.generic import DetailView, View @@ -160,31 +160,49 @@ def render_to_response(self, context, **response_kwargs): if hasattr(self.object, "source_file") and self.object.source_file.file: source_file = self.object.source_file + if source_file.mimetype == "application/pdf": + # If the source file is a PDF, redirect to the source.pdf URL + # This avoids providing an identical file from two different URLs, which is bad for caching, + # bad for Google, and bad for PocketLaw. + return redirect( + reverse( + "document_source_pdf", + kwargs={"frbr_uri": self.object.expression_frbr_uri[1:]}, + ) + ) + if source_file.source_url: return redirect(source_file.source_url) - file = source_file.file.open() - file_bytes = file.read() - response = HttpResponse(file_bytes, content_type=source_file.mimetype) - response[ - "Content-Disposition" - ] = f"inline; filename={source_file.filename_for_download()}" - response["Content-Length"] = str(len(file_bytes)) - return response + return self.make_response( + source_file.file.open(), + source_file.mimetype, + source_file.filename_for_download(), + ) raise Http404 + def make_response(self, f, content_type, fname): + file_bytes = f.read() + response = HttpResponse(file_bytes, content_type=content_type) + response["Content-Disposition"] = f"inline; filename={fname}" + response["Content-Length"] = str(len(file_bytes)) + return response + class DocumentSourcePDFView(DocumentSourceView): def render_to_response(self, context, **response_kwargs): if hasattr(self.object, "source_file"): source_file = self.object.source_file + # if the source file is remote and a pdf, just redirect there if source_file.source_url and source_file.mimetype == "application/pdf": return redirect(source_file.source_url) - file = source_file.as_pdf() - return HttpResponse(file.read(), content_type="application/pdf") - + return self.make_response( + source_file.as_pdf(), + "application/pdf", + source_file.filename_for_download(".pdf"), + ) raise Http404()