Skip to content

Commit

Permalink
Account for inaccurate offsets in getXrefData() (#692)
Browse files Browse the repository at this point in the history
* Account for inaccurate offsets in getXrefData()

Normally offset pointers to `xref` keywords in a PDF document are exact to the byte. However, in some cases the pointer may point to some whitespace *before* the `xref` keyword. Adobe Acrobat takes these 'errors' in stride, displaying the document anyway, and so should PdfParser.

Clean up the getXrefData() function in **RawDataParser.php**. It now only needs to do one `preg_match_all()` and pushes the caret past whitespace when looking for `xref` keywords.

Use existing **Issue557.pdf** to create a new file: **Issue673.pdf** where the last `/Prev 13486` command has been decremented to `/Prev 13485`. Trying to parse this file would cause an Exception without this fix.

* Drop unnecessary PREG_OFFSET_CAPTURE

No need to use `PREG_OFFSET_CAPTURE` here.

---------

Co-authored-by: Konrad Abicht <[email protected]>
  • Loading branch information
GreyWyvern and k00ni authored Apr 2, 2024
1 parent ed3fc0b commit fb77eab
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 26 deletions.
Binary file added samples/bugs/Issue673.pdf
Binary file not shown.
52 changes: 26 additions & 26 deletions src/Smalot/PdfParser/RawData/RawDataParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -864,39 +864,39 @@ private function getHeaderValue(?array $headerDic, string $key, string $type, $d
*/
protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
{
$startxrefPreg = preg_match(
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
// If the $offset is currently pointed at whitespace, bump it
// forward until it isn't; affects loosely targeted offsets
// for the 'xref' keyword
// See: https://github.com/smalot/pdfparser/issues/673
$bumpOffset = $offset;
while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
++$bumpOffset;
}

// Find all startxref tables from this $offset forward
$startxrefPreg = preg_match_all(
'/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
$pdfData,
$matches,
\PREG_OFFSET_CAPTURE,
$startxrefMatches,
\PREG_SET_ORDER,
$offset
);

if (0 == $offset) {
// find last startxref
$pregResult = preg_match_all(
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
$pdfData,
$matches,
\PREG_SET_ORDER,
$offset
);
if (0 == $pregResult) {
throw new \Exception('Unable to find startxref');
}
$matches = array_pop($matches);
$startxref = $matches[1];
} elseif (strpos($pdfData, 'xref', $offset) == $offset) {
if (0 == $startxrefPreg) {
// No startxref tables were found
throw new \Exception('Unable to find startxref');
} elseif (0 == $offset) {
// Use the last startxref in the document
$startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
} elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) {
// Already pointing at the xref table
$startxref = $offset;
} elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
$startxref = $bumpOffset;
} elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
// Cross-Reference Stream object
$startxref = $offset;
} elseif ($startxrefPreg) {
// startxref found
$startxref = $matches[1][0];
$startxref = $bumpOffset;
} else {
throw new \Exception('Unable to find startxref');
// Use the next startxref from this $offset
$startxref = (int) $startxrefMatches[0][1];
}

if ($startxref > \strlen($pdfData)) {
Expand Down
22 changes: 22 additions & 0 deletions tests/PHPUnit/Integration/RawData/RawDataParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -172,4 +172,26 @@ public function testDecodeXrefStreamIssue479(): void
$this->assertArrayHasKey('Subject', $details);
$this->assertArrayHasKey('Title', $details);
}

/**
 * getXrefData() must tolerate offsets that land slightly before `xref`.
 *
 * Offsets read from a PDF document are normally exact. In some
 * documents, however, an offset points at whitespace located *before*
 * a valid xref keyword. The parser is expected to skip forward over
 * that whitespace instead of failing.
 *
 * @see https://github.com/smalot/pdfparser/issues/673
 */
public function testGetXrefDataIssue673(): void
{
    // Before the fix for issue 673, parsing this sample threw an Exception
    $document = $this->getParserInstance()->parseFile(
        $this->rootDir.'/samples/bugs/Issue673.pdf'
    );

    self::assertStringContainsString('6 rue des Goutais', $document->getText());
}
}

0 comments on commit fb77eab

Please sign in to comment.