Skip to content

Commit

Permalink
Fix for two bugs related to Unicode translation support by Font objec…
Browse files Browse the repository at this point in the history
…ts (#698)

* Fix for two bugs related to Unicode translation support by Font objects

Symptom was that some documents' contents was rendering as a bunch of
control characters.  These are the untranslated strings.  This was
happening because for two different reasons, these strings weren't being
translated \Smalot\PdfParser\Font::decodeContent() in some circumstances.

First fix is to \Smalot\PdfParser\Font::loadTranslateTable():

  - Fixed bug where bfchar sections weren't loaded due to mistake in regexp.
  - It now uses `*` instead of `+` and thus supports translation tables with
    lines like `<0000><0000>`.  (Required `<0000> <0000>` before.)

Second fix is for documents that attach their Font objects to the Pages
object instead of each Page object:

  - \Smalot\PdfParser\Page now has a setFonts() method
  - \Smalot\PdfParser\Pages now declares its $fonts variable
  - \Smalot\PdfParser\Pages::getPages() now applies the object's fonts to each child Page
  - \Smalot\PdfParser\Pages::getFonts() copied from Page class

* Pages.php typo fix

Added relative namespace for `ElementMissing`

* Update src/Smalot/PdfParser/Page.php to make doc comment for setFonts()

Co-authored-by: Konrad Abicht <[email protected]>

* Update Pages.php to make getFonts() protected

Requested by @k00ni

* Refined Pages instance

new getFonts method not only
returned stored fonts but also built related fonts
list. With this changes its easier to test and the
replacement (setupFonts) only builds the fonts list.
    
Also refined some phpdoc comments.

* Added PagesTest.php with two integration tests

* added more information to PagesTest.php

* PagesTest.php: fixed comment

* PagesTest.php: added test case which doesn't depend on setFonts explicitly

* PagesTest.php: fixed syntax error

* PagesTest.php: deployed suggested test from @GreyWyvern

Reference: #698 (comment)

---------

Co-authored-by: Konrad Abicht <[email protected]>
  • Loading branch information
unixnut and k00ni authored May 20, 2024
1 parent a19d555 commit 4b86c66
Show file tree
Hide file tree
Showing 4 changed files with 178 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/Font.php
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ public function loadTranslateTable(): array
// Support for multiple bfchar sections
if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $matches)) {
foreach ($matches['sections'] as $section) {
$regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
$regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

preg_match_all($regexp, $section, $matches);

Expand Down
12 changes: 12 additions & 0 deletions src/Smalot/PdfParser/Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,18 @@ class Page extends PDFObject
*/
protected $dataTm;

/**
* @param array<\Smalot\PdfParser\Font> $fonts
*
* @internal
*/
public function setFonts($fonts)
{
if (empty($this->fonts)) {
$this->fonts = $fonts;
}
}

/**
* @return Font[]
*/
Expand Down
60 changes: 59 additions & 1 deletion src/Smalot/PdfParser/Pages.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,13 @@
class Pages extends PDFObject
{
/**
* @todo Objects other than Pages or Page might need to be treated specifically in order to get Page objects out of them,
* @var array<\Smalot\PdfParser\Font>|null
*/
protected $fonts;

/**
* @todo Objects other than Pages or Page might need to be treated specifically
* in order to get Page objects out of them.
*
* @see https://github.com/smalot/pdfparser/issues/331
*/
Expand All @@ -57,17 +63,69 @@ public function getPages(bool $deep = false): array
return $kidsElement->getContent();
}

// Prepare to apply the Pages' object's fonts to each page
if (false === \is_array($this->fonts)) {
$this->setupFonts();
}
$fontsAvailable = 0 < \count($this->fonts);

$kids = $kidsElement->getContent();
$pages = [];

foreach ($kids as $kid) {
if ($kid instanceof self) {
$pages = array_merge($pages, $kid->getPages(true));
} elseif ($kid instanceof Page) {
if ($fontsAvailable) {
$kid->setFonts($this->fonts);
}
$pages[] = $kid;
}
}

return $pages;
}

/**
* Gathers information about fonts and collects them in a list.
*
* @return void
*
* @internal
*/
protected function setupFonts()
{
$resources = $this->get('Resources');

if (method_exists($resources, 'has') && $resources->has('Font')) {
// no fonts available, therefore stop here
if ($resources->get('Font') instanceof Element\ElementMissing) {
return;
}

if ($resources->get('Font') instanceof Header) {
$fonts = $resources->get('Font')->getElements();
} else {
$fonts = $resources->get('Font')->getHeader()->getElements();
}

$table = [];

foreach ($fonts as $id => $font) {
if ($font instanceof Font) {
$table[$id] = $font;

// Store too on cleaned id value (only numeric)
$id = preg_replace('/[^0-9\.\-_]/', '', $id);
if ('' != $id) {
$table[$id] = $font;
}
}
}

$this->fonts = $table;
} else {
$this->fonts = [];
}
}
}
106 changes: 106 additions & 0 deletions tests/PHPUnit/Integration/PagesTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
<?php

/**
* @file This file is part of the PdfParser library.
*
* @author Konrad Abicht <[email protected]>
*
* @date 2024-04-19
*
* @license LGPLv3
*
* @url <https://github.com/smalot/pdfparser>
*
* PdfParser is a pdf library written in PHP, extraction oriented.
* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program.
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
*/

namespace PHPUnitTests\Integration;

use PHPUnitTests\TestCase;
use Smalot\PdfParser\Document;
use Smalot\PdfParser\Element\ElementArray;
use Smalot\PdfParser\Font;
use Smalot\PdfParser\Header;
use Smalot\PdfParser\Page;
use Smalot\PdfParser\Pages;

/**
* @internal only for test purposes
*/
class PagesDummy extends Pages
{
/**
* The purpose of this function is to bypass the tedious
* work to setup instances which lead to a valid $fonts variable.
*
* @param array<\Smalot\PdfParser\Font> $fonts
*
* @return void
*/
public function setFonts($fonts)
{
$this->fonts = $fonts;
}
}

class PagesTest extends TestCase
{
public function testFontsArePassedFromPagesToPage(): void
{
// Create mock Document, Font and Page objects
$document = $this->createMock(Document::class);
$font1 = new Font($document);
$page = new Page($document);

// Create a Header object that indicates $page is a child
$header = new Header([
'Kids' => new ElementArray([
$page,
]),
], $document);

// Use this header to create a mock Pages object
$pages = new PagesDummy($document, $header);

// Apply $font1 as a Font object to this Pages object;
// setFonts is used here as part of PagesDummy, only to access
// the protected Pages::fonts variable; it is not a method
// available in production
$pages->setFonts([$font1]);

// Trigger setupFonts method in $pages
$pages->getPages(true);

// Since the $page object font list is empty, $font1 from Pages
// object must be passed to the Page object
$this->assertEquals([$font1], $page->getFonts());

// Create a second $font2 using a different method
$font2 = $this->createMock(Font::class);

// Update the fonts in $pages
$pages->setFonts([$font1, $font2]);

// Trigger setupFonts method in $pages
$pages->getPages(true);

// Now that $page already has a font, updates from $pages
// should not overwrite it
$this->assertEquals([$font1], $page->getFonts());
}
}

0 comments on commit 4b86c66

Please sign in to comment.