From 4b86c6636d086ca7ea4780c07c2d7390321982b5 Mon Sep 17 00:00:00 2001 From: Alastair Irvine Date: Mon, 20 May 2024 14:14:59 +0800 Subject: [PATCH] Fix for two bugs related to Unicode translation support by Font objects (#698) * Fix for two bugs related to Unicode translation support by Font objects Symptom was that some documents' contents was rendering as a bunch of control characters. These are the untranslated strings. This was happening because for two different reasons, these strings weren't being translated \Smalot\PdfParser\Font::decodeContent() in some circumstances. First fix is to \Smalot\PdfParser\Font::loadTranslateTable(): - Fixed bug where bfchar sections weren't loaded due to mistake in regexp. - It now uses `*` instead of `+` and thus supports translation tables with lines like `<0000><0000>`. (Required `<0000> <0000>` before.) Second fix is for documents that attach their Font objects to the Pages object instead of each Page object: - \Smalot\PdfParser\Page now has a setFonts() method - \Smalot\PdfParser\Pages now declares its $fonts variable - \Smalot\PdfParser\Pages::getPages() now applies the object's fonts to each child Page - \Smalot\PdfParser\Pages::getFonts() copied from Page class * Pages.php typo fix Added relative namespace for `ElementMissing` * Update src/Smalot/PdfParser/Page.php to make doc comment for setFonts() Co-authored-by: Konrad Abicht * Update Pages.php to make getFonts() protected Requested by @k00ni * Refined Pages instance new getFonts method not only returned stored fonts but also built related fonts list. With this changes its easier to test and the replacement (setupFonts) only builds the fonts list. Also refined some phpdoc comments. * Added PagesTest.php with two integration tests * added more information to PagesTest.php * PagesTest.php: fixed comment * PagesTest.php: added test case which doesn't depend on setFonts explicitly * PagesTest.php: fixed syntax error * PagesTest.php: deployed suggested test from @GreyWyvern Reference: https://github.com/smalot/pdfparser/pull/698#issuecomment-2112946865 --------- Co-authored-by: Konrad Abicht --- src/Smalot/PdfParser/Font.php | 2 +- src/Smalot/PdfParser/Page.php | 12 +++ src/Smalot/PdfParser/Pages.php | 60 +++++++++++++- tests/PHPUnit/Integration/PagesTest.php | 106 ++++++++++++++++++++++++ 4 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 tests/PHPUnit/Integration/PagesTest.php diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php index cfe85d79..af2d0f9f 100644 --- a/src/Smalot/PdfParser/Font.php +++ b/src/Smalot/PdfParser/Font.php @@ -191,7 +191,7 @@ public function loadTranslateTable(): array // Support for multiple bfchar sections if (preg_match_all('/beginbfchar(?P.*?)endbfchar/s', $content, $matches)) { foreach ($matches['sections'] as $section) { - $regexp = '/<(?P[0-9A-F]+)> +<(?P[0-9A-F]+)>[ \r\n]+/is'; + $regexp = '/<(?P[0-9A-F]+)> *<(?P[0-9A-F]+)>[ \r\n]+/is'; preg_match_all($regexp, $section, $matches); diff --git a/src/Smalot/PdfParser/Page.php b/src/Smalot/PdfParser/Page.php index d6ffaf08..0dafbfcc 100644 --- a/src/Smalot/PdfParser/Page.php +++ b/src/Smalot/PdfParser/Page.php @@ -54,6 +54,18 @@ class Page extends PDFObject */ protected $dataTm; + /** + * @param array<\Smalot\PdfParser\Font> $fonts + * + * @internal + */ + public function setFonts($fonts) + { + if (empty($this->fonts)) { + $this->fonts = $fonts; + } + } + /** * @return Font[] */ diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index 6b878650..f95134b1 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -40,7 +40,13 @@ class Pages extends PDFObject { /** - * @todo Objects other than Pages or Page might need to be treated specifically in order to get Page objects out of them, + * @var array<\Smalot\PdfParser\Font>|null + */ + protected $fonts; + + /** + * @todo Objects other than Pages or Page might need to be treated specifically + * in order to get Page objects out of them. * * @see https://github.com/smalot/pdfparser/issues/331 */ @@ -57,6 +63,12 @@ public function getPages(bool $deep = false): array return $kidsElement->getContent(); } + // Prepare to apply the Pages' object's fonts to each page + if (false === \is_array($this->fonts)) { + $this->setupFonts(); + } + $fontsAvailable = 0 < \count($this->fonts); + $kids = $kidsElement->getContent(); $pages = []; @@ -64,10 +76,56 @@ public function getPages(bool $deep = false): array if ($kid instanceof self) { $pages = array_merge($pages, $kid->getPages(true)); } elseif ($kid instanceof Page) { + if ($fontsAvailable) { + $kid->setFonts($this->fonts); + } $pages[] = $kid; } } return $pages; } + + /** + * Gathers information about fonts and collects them in a list. + * + * @return void + * + * @internal + */ + protected function setupFonts() + { + $resources = $this->get('Resources'); + + if (method_exists($resources, 'has') && $resources->has('Font')) { + // no fonts available, therefore stop here + if ($resources->get('Font') instanceof Element\ElementMissing) { + return; + } + + if ($resources->get('Font') instanceof Header) { + $fonts = $resources->get('Font')->getElements(); + } else { + $fonts = $resources->get('Font')->getHeader()->getElements(); + } + + $table = []; + + foreach ($fonts as $id => $font) { + if ($font instanceof Font) { + $table[$id] = $font; + + // Store too on cleaned id value (only numeric) + $id = preg_replace('/[^0-9\.\-_]/', '', $id); + if ('' != $id) { + $table[$id] = $font; + } + } + } + + $this->fonts = $table; + } else { + $this->fonts = []; + } + } } diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php new file mode 100644 index 00000000..fb069c08 --- /dev/null +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -0,0 +1,106 @@ + + * + * @date 2024-04-19 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element\ElementArray; +use Smalot\PdfParser\Font; +use Smalot\PdfParser\Header; +use Smalot\PdfParser\Page; +use Smalot\PdfParser\Pages; + +/** + * @internal only for test purposes + */ +class PagesDummy extends Pages +{ + /** + * The purpose of this function is to bypass the tedious + * work to setup instances which lead to a valid $fonts variable. + * + * @param array<\Smalot\PdfParser\Font> $fonts + * + * @return void + */ + public function setFonts($fonts) + { + $this->fonts = $fonts; + } +} + +class PagesTest extends TestCase +{ + public function testFontsArePassedFromPagesToPage(): void + { + // Create mock Document, Font and Page objects + $document = $this->createMock(Document::class); + $font1 = new Font($document); + $page = new Page($document); + + // Create a Header object that indicates $page is a child + $header = new Header([ + 'Kids' => new ElementArray([ + $page, + ]), + ], $document); + + // Use this header to create a mock Pages object + $pages = new PagesDummy($document, $header); + + // Apply $font1 as a Font object to this Pages object; + // setFonts is used here as part of PagesDummy, only to access + // the protected Pages::fonts variable; it is not a method + // available in production + $pages->setFonts([$font1]); + + // Trigger setupFonts method in $pages + $pages->getPages(true); + + // Since the $page object font list is empty, $font1 from Pages + // object must be passed to the Page object + $this->assertEquals([$font1], $page->getFonts()); + + // Create a second $font2 using a different method + $font2 = $this->createMock(Font::class); + + // Update the fonts in $pages + $pages->setFonts([$font1, $font2]); + + // Trigger setupFonts method in $pages + $pages->getPages(true); + + // Now that $page already has a font, updates from $pages + // should not overwrite it + $this->assertEquals([$font1], $page->getFonts()); + } +}