Skip to content

Commit

Permalink
Absorb spaces after 'stream' declarations (#642)
Browse files Browse the repository at this point in the history
There appear to be some additional font issues remaining, but the text content is now read by getText().
  • Loading branch information
GreyWyvern authored Sep 28, 2023
1 parent 5c48261 commit 051ec84
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/RawData/RawDataParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,7 @@ protected function getRawObject(string $pdfData, int $offset = 0, array $headerD
// start stream object
$objtype = 'stream';
$offset += 6;
if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
$offset += \strlen($matches[0]);

// we get stream length here to later help preg_match test less data
Expand Down
19 changes: 19 additions & 0 deletions tests/PHPUnit/Integration/RawData/RawDataParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,25 @@ public function testGetRawObjectIssue372(): void
],
$result
);

// Test that spaces after a 'stream' declaration are absorbed
// See: https://github.com/smalot/pdfparser/issues/641
$data = 'stream '."\n";
$data .= 'streamdata'."\n";
$data .= 'endstream'."\n";
$data .= 'endobj';

$result = $this->fixture->exposeGetRawObject($data);

// Value 'streamdata'."\n" would be empty string without the fix
$this->assertEquals(
[
'stream',
'streamdata'."\n",
19,
],
$result
);
}

/**
Expand Down

0 comments on commit 051ec84

Please sign in to comment.