Add parsing of PDF titles

This commit attempts to parse the title metadata in a PDF file if it exists; otherwise it falls back to printing only the media type and size. We use the `pdf` crate to parse all of the response body (for some reason metadata and table of contents are kept at the end of PDFs) and then ask the PDF for the title that may be defined in the "[info dictionary]".

[info dictionary]: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
nuxeh · Apr 12, 2021 · 72555b9 · 72555b9
1 parent b4175be
commit 72555b9
Show file tree

Hide file tree

Showing 5 changed files with 273 additions and 12 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -51,6 +51,7 @@ stderrlog = "0.5.1"
 atty = "0.2.14"
 scraper = { version = "0.12.0", default-features = false, features = [] }
 phf = "0.7.24"
+pdf = "0.7.1"
 
 [dependencies.image]
 version = "0.22.5"