From 051ec84efdad09d72946c354f1ffe15b2eebccc3 Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Thu, 28 Sep 2023 06:40:42 -0400 Subject: [PATCH] Absorb spaces after 'stream' declarations (#642) There appear to be some additional issues with fonts remaining, but the text content is now read by getText(). --- .../PdfParser/RawData/RawDataParser.php | 2 +- .../Integration/RawData/RawDataParserTest.php | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index ec8f600b..77630897 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -756,7 +756,7 @@ protected function getRawObject(string $pdfData, int $offset = 0, array $headerD // start stream object $objtype = 'stream'; $offset += 6; - if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) { + if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) { $offset += \strlen($matches[0]); // we get stream length here to later help preg_match test less data diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index 17c64d30..dec70977 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -87,6 +87,25 @@ public function testGetRawObjectIssue372(): void ], $result ); + + // Test that spaces after a 'stream' declaration are absorbed + // See: https://github.com/smalot/pdfparser/issues/641 + $data = 'stream '."\n"; + $data .= 'streamdata'."\n"; + $data .= 'endstream'."\n"; + $data .= 'endobj'; + + $result = $this->fixture->exposeGetRawObject($data); + + // Value 'streamdata'."\n" would be empty string without the fix + $this->assertEquals( + [ + 'stream', + 'streamdata'."\n", + 19, + ], + $result + ); } /**