Skip to content

Commit

Permalink
Enable PDFDocEncoding support for metadata (#611)
Browse files Browse the repository at this point in the history
* Enable PDFDocEncoding support

Regular PDF metadata (outside of XMP), depending on the characters it includes, can be encoded in UTF-8 escaped (or binary) bytes, or using a proprietary Adobe encoding PDFDocEncoding which is similar to, but not exactly like CP1252.

For more information on the PDFDocEncoding character set, see: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf

Another issue is that, regardless of the storage encoding used, Adobe Acrobat will attempt to insert an escaped line break (a backslash followed by a carriage return, i.e. `\` + `\r`) into metadata text to avoid long line lengths (~127 bytes) in the saved PDF data. Unfortunately, the method used to do this does not seem to be multi-byte safe, so UTF-8 byte sequences that straddle such a break are corrupted and must be repaired.

This commit enables decoding PDF metadata using PDFDocEncoding, and also repairs added line-feeds in both PDFDocEncoding and UTF-8.

It also adds a sample file "Issue609.pdf" containing both UTF-8 and PDFDocEncoding encoded metadata fields for testing. The name of the file references PDFParser issue #609: #609

* Update PDFDocEncoding.php

I hope I am not assuming too much by adding myself as the author of this file!

* PR #611 suggested changes

Add comments in Document.php
Use plain class PDFDocEncoding, do not extend AbstractEncoding
array() => []
Break up class functions into one that returns the code table, and another that uses the table to perform the conversion

* fixed coding style issues in Document.php

* fixed coding style issue in PDFDocEncoding.php

---------

Co-authored-by: Konrad Abicht <[email protected]>
  • Loading branch information
GreyWyvern and k00ni authored Jul 11, 2023
1 parent 66ddf47 commit d03ef96
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 0 deletions.
Binary file added samples/Issue609.pdf
Binary file not shown.
43 changes: 43 additions & 0 deletions src/Smalot/PdfParser/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@

namespace Smalot\PdfParser;

use Smalot\PdfParser\Encoding\PDFDocEncoding;

/**
* Technical references :
* - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
Expand Down Expand Up @@ -149,6 +151,47 @@ protected function buildDetails()
$details['Pages'] = 0;
}

// Decode and repair encoded document properties. Only string values are
// touched; every other entry of $details is left as it is.
foreach ($details as $key => $value) {
    if (\is_string($value)) {
        // If the string is already UTF-8 encoded, that means we only
        // need to repair Adobe's ham-fisted insertion of line breaks
        // every ~127 characters, which doesn't seem to be multi-byte
        // safe
        if (mb_check_encoding($value, 'UTF-8')) {
            // Remove the escaped line break: a literal backslash (0x5C)
            // followed by a carriage return (0x0D)
            $value = str_replace("\x5c\x0d", '', $value);

            // Remove backslash plus bytes written into high part of
            // multibyte unicode character
            while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
                $diff = (\ord($match[1]) - 182) * 64;
                // Mask to a single byte explicitly instead of relying on
                // chr() wrapping out-of-range values around.
                $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr((\ord($match[2]) + $diff) & 0xff));
                // Replace the matched bytes literally. The captured byte
                // may be a regex metacharacter (or the delimiter), so it
                // must not be spliced into a new pattern unquoted.
                $value = str_replace($match[0], $newbyte, $value);
            }

            // Remove bytes written into low part of multibyte unicode
            // character
            while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
                $diff = \ord($match[2]) - 181;
                $newbyte = \chr((\ord($match[1]) + $diff) & 0xff);
                // Literal replacement for the same reason as above: the
                // first captured byte can be any character.
                $value = str_replace($match[0], $newbyte, $value);
            }

            // Remove this byte string that Adobe occasionally adds
            // between two single byte characters in a unicode string.
            // NOTE(review): these three bytes are also the UTF-8 form of
            // U+5C0D, so a genuine occurrence of that character is
            // stripped as well - confirm this trade-off is acceptable.
            $value = str_replace("\xe5\xb0\x8d", '', $value);

            $details[$key] = $value;
        } else {
            // If the string is just PDFDocEncoding, remove any escaped
            // line breaks (backslash + carriage return) and decode the
            // whole thing.
            $value = str_replace("\\\r", '', $value);
            $details[$key] = PDFDocEncoding::convertPDFDoc2UTF8($value);
        }
    }
}

$details = array_merge($details, $this->metadata);

$this->details = $details;
Expand Down
189 changes: 189 additions & 0 deletions src/Smalot/PdfParser/Encoding/PDFDocEncoding.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
<?php

/**
* @file This file is part of the PdfParser library.
*
* @author Brian Huisman <[email protected]>
*
* @date 2023-06-28
*
* @license LGPLv3
*
* @url <https://github.com/smalot/pdfparser>
*
* PdfParser is a pdf library written in PHP, extraction oriented.
* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program.
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
*/

// Source : https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.2.pdf
// Source : https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf

namespace Smalot\PdfParser\Encoding;

/**
 * Class PDFDocEncoding
 *
 * Converts strings stored in Adobe's single-byte PDFDocEncoding into UTF-8.
 * Only the bytes listed in the code page are translated; every other byte
 * is passed through unchanged.
 */
class PDFDocEncoding
{
    /**
     * Returns the translation table from PDFDocEncoding bytes to UTF-8.
     *
     * Keys are single-byte strings, values are the UTF-8 replacement. Bytes
     * mapped to an empty string (0x7F, 0x9F, 0xAD) are dropped on conversion.
     *
     * @return array<string, string>
     */
    public static function getCodePage(): array
    {
        return [
            "\x18" => "\u{02d8}", // breve
            "\x19" => "\u{02c7}", // caron
            "\x1a" => "\u{02c6}", // circumflex
            "\x1b" => "\u{02d9}", // dotaccent
            "\x1c" => "\u{02dd}", // hungarumlaut
            "\x1d" => "\u{02db}", // ogonek
            "\x1e" => "\u{02da}", // ring (U+02DA RING ABOVE)
            "\x1f" => "\u{02dc}", // tilde
            "\x7f" => '',
            "\x80" => "\u{2022}", // bullet
            "\x81" => "\u{2020}", // dagger
            "\x82" => "\u{2021}", // daggerdbl
            "\x83" => "\u{2026}", // ellipsis
            "\x84" => "\u{2014}", // emdash
            "\x85" => "\u{2013}", // endash
            "\x86" => "\u{0192}", // florin
            "\x87" => "\u{2044}", // fraction
            "\x88" => "\u{2039}", // guilsinglleft
            "\x89" => "\u{203a}", // guilsinglright
            "\x8a" => "\u{2212}", // minus
            "\x8b" => "\u{2030}", // perthousand
            "\x8c" => "\u{201e}", // quotedblbase
            "\x8d" => "\u{201c}", // quotedblleft
            "\x8e" => "\u{201d}", // quotedblright
            "\x8f" => "\u{2018}", // quoteleft
            "\x90" => "\u{2019}", // quoteright
            "\x91" => "\u{201a}", // quotesinglbase
            "\x92" => "\u{2122}", // trademark
            "\x93" => "\u{fb01}", // fi
            "\x94" => "\u{fb02}", // fl
            "\x95" => "\u{0141}", // Lslash
            "\x96" => "\u{0152}", // OE
            "\x97" => "\u{0160}", // Scaron
            "\x98" => "\u{0178}", // Ydieresis
            "\x99" => "\u{017d}", // Zcaron
            "\x9a" => "\u{0131}", // dotlessi
            "\x9b" => "\u{0142}", // lslash
            "\x9c" => "\u{0153}", // oe
            "\x9d" => "\u{0161}", // scaron
            "\x9e" => "\u{017e}", // zcaron
            "\x9f" => '',
            "\xa0" => "\u{20ac}", // Euro
            "\xa1" => "\u{00a1}", // exclamdown
            "\xa2" => "\u{00a2}", // cent
            "\xa3" => "\u{00a3}", // sterling
            "\xa4" => "\u{00a4}", // currency
            "\xa5" => "\u{00a5}", // yen
            "\xa6" => "\u{00a6}", // brokenbar
            "\xa7" => "\u{00a7}", // section
            "\xa8" => "\u{00a8}", // dieresis
            "\xa9" => "\u{00a9}", // copyright
            "\xaa" => "\u{00aa}", // ordfeminine
            "\xab" => "\u{00ab}", // guillemotleft
            "\xac" => "\u{00ac}", // logicalnot
            "\xad" => '',
            "\xae" => "\u{00ae}", // registered
            "\xaf" => "\u{00af}", // macron
            "\xb0" => "\u{00b0}", // degree
            "\xb1" => "\u{00b1}", // plusminus
            "\xb2" => "\u{00b2}", // twosuperior
            "\xb3" => "\u{00b3}", // threesuperior
            "\xb4" => "\u{00b4}", // acute
            "\xb5" => "\u{00b5}", // mu
            "\xb6" => "\u{00b6}", // paragraph
            "\xb7" => "\u{00b7}", // periodcentered
            "\xb8" => "\u{00b8}", // cedilla
            "\xb9" => "\u{00b9}", // onesuperior
            "\xba" => "\u{00ba}", // ordmasculine
            "\xbb" => "\u{00bb}", // guillemotright
            "\xbc" => "\u{00bc}", // onequarter
            "\xbd" => "\u{00bd}", // onehalf
            "\xbe" => "\u{00be}", // threequarters
            "\xbf" => "\u{00bf}", // questiondown
            "\xc0" => "\u{00c0}", // Agrave
            "\xc1" => "\u{00c1}", // Aacute
            "\xc2" => "\u{00c2}", // Acircumflex
            "\xc3" => "\u{00c3}", // Atilde
            "\xc4" => "\u{00c4}", // Adieresis
            "\xc5" => "\u{00c5}", // Aring
            "\xc6" => "\u{00c6}", // AE
            "\xc7" => "\u{00c7}", // Ccedilla
            "\xc8" => "\u{00c8}", // Egrave
            "\xc9" => "\u{00c9}", // Eacute
            "\xca" => "\u{00ca}", // Ecircumflex
            "\xcb" => "\u{00cb}", // Edieresis
            "\xcc" => "\u{00cc}", // Igrave
            "\xcd" => "\u{00cd}", // Iacute
            "\xce" => "\u{00ce}", // Icircumflex
            "\xcf" => "\u{00cf}", // Idieresis
            "\xd0" => "\u{00d0}", // Eth
            "\xd1" => "\u{00d1}", // Ntilde
            "\xd2" => "\u{00d2}", // Ograve
            "\xd3" => "\u{00d3}", // Oacute
            "\xd4" => "\u{00d4}", // Ocircumflex
            "\xd5" => "\u{00d5}", // Otilde
            "\xd6" => "\u{00d6}", // Odieresis
            "\xd7" => "\u{00d7}", // multiply
            "\xd8" => "\u{00d8}", // Oslash
            "\xd9" => "\u{00d9}", // Ugrave
            "\xda" => "\u{00da}", // Uacute
            "\xdb" => "\u{00db}", // Ucircumflex
            "\xdc" => "\u{00dc}", // Udieresis
            "\xdd" => "\u{00dd}", // Yacute
            "\xde" => "\u{00de}", // Thorn
            "\xdf" => "\u{00df}", // germandbls
            "\xe0" => "\u{00e0}", // agrave
            "\xe1" => "\u{00e1}", // aacute
            "\xe2" => "\u{00e2}", // acircumflex
            "\xe3" => "\u{00e3}", // atilde
            "\xe4" => "\u{00e4}", // adieresis
            "\xe5" => "\u{00e5}", // aring
            "\xe6" => "\u{00e6}", // ae
            "\xe7" => "\u{00e7}", // ccedilla
            "\xe8" => "\u{00e8}", // egrave
            "\xe9" => "\u{00e9}", // eacute
            "\xea" => "\u{00ea}", // ecircumflex
            "\xeb" => "\u{00eb}", // edieresis
            "\xec" => "\u{00ec}", // igrave
            "\xed" => "\u{00ed}", // iacute
            "\xee" => "\u{00ee}", // icircumflex
            "\xef" => "\u{00ef}", // idieresis
            "\xf0" => "\u{00f0}", // eth
            "\xf1" => "\u{00f1}", // ntilde
            "\xf2" => "\u{00f2}", // ograve
            "\xf3" => "\u{00f3}", // oacute
            "\xf4" => "\u{00f4}", // ocircumflex
            "\xf5" => "\u{00f5}", // otilde
            "\xf6" => "\u{00f6}", // odieresis
            "\xf7" => "\u{00f7}", // divide
            "\xf8" => "\u{00f8}", // oslash
            "\xf9" => "\u{00f9}", // ugrave
            "\xfa" => "\u{00fa}", // uacute
            "\xfb" => "\u{00fb}", // ucircumflex
            "\xfc" => "\u{00fc}", // udieresis
            "\xfd" => "\u{00fd}", // yacute
            "\xfe" => "\u{00fe}", // thorn
            "\xff" => "\u{00ff}", // ydieresis
        ];
    }

    /**
     * Converts a PDFDocEncoding byte string to UTF-8.
     *
     * Each byte that has an entry in the code page is replaced by its UTF-8
     * counterpart; bytes without an entry are copied through untouched.
     *
     * @param string $content raw PDFDocEncoding bytes
     *
     * @return string the UTF-8 encoded result
     */
    public static function convertPDFDoc2UTF8(string $content): string
    {
        // static:: keeps late static binding so a subclass may supply its
        // own code page.
        return strtr($content, static::getCodePage());
    }
}
27 changes: 27 additions & 0 deletions tests/PHPUnit/Integration/DocumentTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -271,4 +271,31 @@ public function testExtractXMPMetadata(): void
// Metadata.
self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']);
}

/**
 * Checks that Document Properties stored in PDFDocEncoding (or in UTF-8
 * damaged by Adobe's inserted line breaks) come back as clean UTF-8.
 *
 * @see https://github.com/smalot/pdfparser/issues/609
 */
public function testPDFDocEncodingDecode(): void
{
    $parser = new Parser();
    $details = $parser->parseFile($this->rootDir.'/samples/Issue609.pdf')->getDetails();

    // Both fragments must survive in the UTF-8 escaped Keywords field:
    // the Adobe-inserted \r sequences have to be stripped and the
    // characters around them repaired.
    $expectedKeywordFragments = [
        '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™fiflŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ',
        'added line-feeds often destroy multibyte characters',
    ];
    foreach ($expectedKeywordFragments as $fragment) {
        self::assertStringContainsString($fragment, $details['Keywords']);
    }

    // The Subject field covers the PDFDocEncoding characters that differ
    // from CP-1252: they must decode to the right UTF-8 code points,
    // again with the \r line breaks removed.
    self::assertStringContainsString('•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž', $details['Subject']);
}
}

0 comments on commit d03ef96

Please sign in to comment.