diff --git a/samples/bugs/Issue668.pdf b/samples/bugs/Issue668.pdf new file mode 100644 index 00000000..dc502543 Binary files /dev/null and b/samples/bugs/Issue668.pdf differ diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php index 40973da0..8d1d14de 100644 --- a/src/Smalot/PdfParser/PDFObject.php +++ b/src/Smalot/PdfParser/PDFObject.php @@ -214,6 +214,16 @@ private function formatContent(?string $content): string return ''; } + // Outside of (String) content in PDF document streams, all + // text should conform to UTF-8. Test for binary content by + // deleting everything after the first open-parenthesis ( which + // indicates the beginning of a string. Then test what remains + // for valid UTF-8. If it's not UTF-8, return an empty string + // as this $content is most likely binary. + if (false === mb_check_encoding(preg_replace('/\(.*$/s', '', $content), 'UTF-8')) { + return ''; + } + // Find all strings () and replace them so they aren't affected // by the next steps $pdfstrings = []; @@ -261,17 +271,6 @@ private function formatContent(?string $content): string ); } - // Now that all strings and dictionaries are hidden, the only - // PDF commands left should all be plain text. - // Detect text encoding of the current string to prevent reading - // content streams that are images, etc. This prevents PHP - // error messages when JPEG content is sent to this function - // by the sample file '12249.pdf' from: - // https://github.com/smalot/pdfparser/issues/458 - if (false === mb_detect_encoding($content, null, true)) { - return ''; - } - // Normalize white-space in the document stream $content = preg_replace('/\s{2,}/', ' ', $content); diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php index 9ec68043..025c11b1 100644 --- a/tests/PHPUnit/Integration/PDFObjectTest.php +++ b/tests/PHPUnit/Integration/PDFObjectTest.php @@ -274,6 +274,16 @@ public function testFormatContent(): void $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content); $this->assertEquals('', $cleaned); + + // See: https://github.com/smalot/pdfparser/issues/668 + $filename = $this->rootDir.'/samples/bugs/Issue668.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + // Binary check is done before a regexp that causes an error + $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText()); } public function testGetSectionsText(): void