smalot · k00ni · Feb 12, 2021 · Feb 9, 2021 · Feb 9, 2021 · Feb 9, 2021
diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
@@ -141,6 +141,36 @@ jobs:
       - name: "Run PHPUnit"
         run: "php vendor/bin/simple-phpunit -v"
 
+  alt-autoload:  # runs tests/AltAutoloadTest.php with the plain php binary on each PHP version below; this job has no Composer install step
+    name: "Tests alternative autoloader (PHP ${{ matrix.php }})"
+    runs-on: "ubuntu-20.04"
+
+    env:
+      SYMFONY_PHPUNIT_VERSION: 7.5  # NOTE(review): no step of this job invokes PHPUnit, so this variable looks unused here - confirm
+
+    strategy:
+      matrix:
+        php:
+          - "5.6"
+          - "7.0"
+          - "7.1"
+          - "7.2"
+          - "7.3"
+          - "7.4"
+
+    steps:
+      - name: "Checkout"
+        uses: "actions/checkout@v2"
+
+      - name: "Install PHP"
+        uses: "shivammathur/setup-php@v2"
+        with:
+          php-version: "${{ matrix.php }}"
+          coverage: "none"
+
+      - name: "Test alt-autoload"
+        run: "php tests/AltAutoloadTest.php"  # the script throws an exception when the check fails
+
   phpunit-lowest:
     name: "PHPUnit lowest deps (PHP ${{ matrix.php }})"
     runs-on: "ubuntu-20.04"

diff --git a/README.md b/README.md
@@ -42,6 +42,19 @@ As a result, users must expect BC breaks when using the master version.
 
 Original PDF References files can be downloaded from this url: http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
 
+## Installation
+
+### Using Composer
+
+* Obtain [Composer](https://getcomposer.org)
+* Run `composer require smalot/pdfparser`
+
+### Use alternate file loader
+
+If you can't use Composer, include `alt_autoload.php-dist` in your project.
+It loads all required files at once.
+Afterwards you can use the `Smalot\PdfParser\Parser` class and the other library classes.
+
 ## License ##
 
 This library is under the [LGPLv3 license](https://github.com/smalot/pdfparser/blob/master/LICENSE.txt).
diff --git a/alt_autoload.php-dist b/alt_autoload.php-dist
@@ -0,0 +1,74 @@
+<?php
+
+/**
+ * @file This file is part of the PdfParser library.
+ *
+ * @author  Konrad Abicht <[email protected]>
+ * @date    2021-02-09
+ *
+ * @license LGPLv3
+ * @url     <https://github.com/smalot/pdfparser>
+ *
+ *  PdfParser is a pdf library written in PHP, extraction oriented.
+ *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.
+ *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
+ *
+ * --------------------------------------------------------------------------------------
+ *
+ * About:
+ * This file provides an alternative to the Composer approach.
+ * Include it in your project and all required files of PdfParser will be loaded automatically.
+ * Please use it only if Composer is not available.
+ *
+ * How to use:
+ * 1. include this file as it is OR copy and rename it as you like (and then include it)
+ * 2. afterwards you can use PDFParser classes
+ * Done.
+ */
+
+/**
+ * Requires (once) every *.php file in the given folder and, recursively, in its sub folders.
+ *
+ * @param string $dir path of the folder to scan; entries without a php extension are skipped
+ */
+function requireFilesOfFolder($dir)
+{
+    foreach (new DirectoryIterator($dir) as $fileInfo) {
+        if ($fileInfo->isDot()) {
+            continue;
+        }
+        if ($fileInfo->isDir()) {
+            requireFilesOfFolder($fileInfo->getPathname());
+        } elseif ('php' === strtolower($fileInfo->getExtension())) { // never include stray non-PHP files
+            require_once $fileInfo->getPathname();
+        }
+    }
+}
+
+$rootFolder = __DIR__.'/src/Smalot/PdfParser';
+
+// Require these files up front, before the folder scan; presumably other classes need them defined first (TODO confirm).
+require_once $rootFolder.'/Element.php';
+require_once $rootFolder.'/PDFObject.php';
+require_once $rootFolder.'/Font.php';
+require_once $rootFolder.'/Page.php';
+require_once $rootFolder.'/Element/ElementString.php';
+
+/*
+ * Load the remaining PdfParser files from src/Smalot/PdfParser.
+ * Don't worry, files that were already required above are not loaded a second time.
+ */
+requireFilesOfFolder($rootFolder);
diff --git a/src/Smalot/PdfParser/Element/ElementHexa.php b/src/Smalot/PdfParser/Element/ElementHexa.php
@@ -75,7 +75,7 @@ public static function decode($value, Document $document = null)
         if ('00' === substr($value, 0, 2)) {
             for ($i = 0; $i < $length; $i += 4) {
                 $hex = substr($value, $i, 4);
-                $text .= '&#'.str_pad(hexdec($hex), 4, '0', STR_PAD_LEFT).';';
+                $text .= '&#'.str_pad(hexdec($hex), 4, '0', \STR_PAD_LEFT).';';
             }
         } else {
             for ($i = 0; $i < $length; $i += 2) {
@@ -84,7 +84,7 @@ public static function decode($value, Document $document = null)
             }
         }
 
-        $text = html_entity_decode($text, ENT_NOQUOTES, 'UTF-8');
+        $text = html_entity_decode($text, \ENT_NOQUOTES, 'UTF-8');
 
         return $text;
     }

diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php
@@ -175,7 +175,7 @@ public function loadTranslateTable()
                             '/([0-9A-F]{4})/i',
                             $matches['to'][$key],
                             0,
-                            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
+                            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
                         );
                         $text = '';
                         foreach ($parts as $part) {
@@ -221,7 +221,7 @@ public function loadTranslateTable()
                                 '/([0-9A-F]{4})/i',
                                 $string,
                                 0,
-                                PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
+                                \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
                             );
                             $text = '';
                             foreach ($parts as $part) {
@@ -259,7 +259,7 @@ public static function decodeHexadecimal($hexa, $add_braces = false)
         }
 
         $text = '';
-        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
+        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
 
         foreach ($parts as $part) {
             if (preg_match('/^<.*>$/s', $part) && false === stripos($part, '<?xml')) {
@@ -291,7 +291,7 @@ public static function decodeHexadecimal($hexa, $add_braces = false)
      */
     public static function decodeOctal($text)
     {
-        $parts = preg_split('/(\\\\\d{3})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
+        $parts = preg_split('/(\\\\\d{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
         $text = '';
 
         foreach ($parts as $part) {
@@ -312,7 +312,7 @@ public static function decodeOctal($text)
      */
     public static function decodeEntities($text)
     {
-        $parts = preg_split('/(#\d{2})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
+        $parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
         $text = '';
 
         foreach ($parts as $part) {
@@ -470,7 +470,7 @@ public function decodeContent($text, &$unicode = null)
                         '//s'.($unicode ? 'u' : ''),
                         $text,
                         -1,
-                        PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
+                        \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
                 );
 
                 foreach ($chars as $char) {

diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php
@@ -147,25 +147,25 @@ public function cleanContent($content, $char = 'X')
         $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);
 
         // Remove image bloc with binary content
-        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, PREG_OFFSET_CAPTURE);
+        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
         foreach ($matches[0] as $part) {
             $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0]));
         }
 
         // Clean content in square brackets [.....]
-        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, PREG_OFFSET_CAPTURE);
+        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
         foreach ($matches[1] as $part) {
             $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0]));
         }
 
         // Clean content in round brackets (.....)
-        preg_match_all('/\((.*?)\)/s', $content, $matches, PREG_OFFSET_CAPTURE);
+        preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
         foreach ($matches[1] as $part) {
             $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0]));
         }
 
         // Clean structure
-        if ($parts = preg_split('/(<|>)/s', $content, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE)) {
+        if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) {
             $content = '';
             $level = 0;
             foreach ($parts as $part) {
@@ -186,13 +186,13 @@ public function cleanContent($content, $char = 'X')
             '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char).'*BDC)/s',
             $content,
             $matches,
-            PREG_OFFSET_CAPTURE
+            \PREG_OFFSET_CAPTURE
         );
         foreach ($matches[1] as $part) {
             $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0]));
         }
 
-        preg_match_all('/\s(EMC)\s/s', $content, $matches, PREG_OFFSET_CAPTURE);
+        preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
         foreach ($matches[1] as $part) {
             $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0]));
         }
@@ -212,7 +212,7 @@ public function getSectionsText($content)
         $textCleaned = $this->cleanContent($content, '_');
 
         // Extract text blocks.
-        if (preg_match_all('/\s+BT[\s|\(|\[]+(.*?)\s*ET/s', $textCleaned, $matches, PREG_OFFSET_CAPTURE)) {
+        if (preg_match_all('/\s+BT[\s|\(|\[]+(.*?)\s*ET/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) {
             foreach ($matches[1] as $part) {
                 $text = $part[0];
                 if ('' === $text) {
@@ -229,7 +229,7 @@ public function getSectionsText($content)
         }
 
         // Extract 'do' commands.
-        if (preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $textCleaned, $matches, PREG_OFFSET_CAPTURE)) {
+        if (preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) {
             foreach ($matches[1] as $part) {
                 $text = $part[0];
                 $offset = $part[1];

diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php
@@ -199,7 +199,7 @@ protected function parseObject($id, $structure, $document)
                             '/(\d+\s+\d+\s*)/s',
                             $match[1],
                             -1,
-                          PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
+                          \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
                         );
                         $table = [];
 

diff --git a/src/Smalot/PdfParser/RawData/FilterHelper.php b/src/Smalot/PdfParser/RawData/FilterHelper.php
@@ -231,7 +231,7 @@ protected function decodeFilterFlateDecode($data)
          * the following set_error_handler changes an E_WARNING to an E_ERROR, which is catchable.
          */
         set_error_handler(function ($errNo, $errStr) {
-            if (E_WARNING === $errNo) {
+            if (\E_WARNING === $errNo) {
                 throw new Exception($errStr);
             } else {
                 // fallback to default php error handler

diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php
@@ -153,7 +153,7 @@ protected function decodeXref($pdfData, $startxref, $xref = [])
         // initialize object number
         $obj_num = 0;
         // search for cross-reference entries or subsection
-        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) {
+        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) {
             if ($matches[0][1] != $offset) {
                 // we are on another section
                 break;
@@ -176,7 +176,7 @@ protected function decodeXref($pdfData, $startxref, $xref = [])
             }
         }
         // get trailer data
-        if (preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) {
+        if (preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) {
             $trailer_data = $matches[1][0];
             if (!isset($xref['trailer']) || empty($xref['trailer'])) {
                 // get only the last updated version
@@ -717,7 +717,7 @@ protected function getRawObject($pdfData, $offset = 0)
                                 '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
                                 substr($pdfData, $offset),
                                 $matches,
-                                PREG_OFFSET_CAPTURE
+                                \PREG_OFFSET_CAPTURE
                             );
                             if (1 == $pregResult) {
                                 $objval = substr($pdfData, $offset, $matches[0][1]);
@@ -768,7 +768,7 @@ protected function getXrefData($pdfData, $offset = 0, $xref = [])
             '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
             $pdfData,
             $matches,
-            PREG_OFFSET_CAPTURE,
+            \PREG_OFFSET_CAPTURE,
             $offset
         );
 
@@ -777,7 +777,7 @@ protected function getXrefData($pdfData, $offset = 0, $xref = [])
             $pregResult = preg_match_all(
                 '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
                 $pdfData, $matches,
-                PREG_SET_ORDER,
+                \PREG_SET_ORDER,
                 $offset
             );
             if (0 == $pregResult) {
@@ -788,7 +788,7 @@ protected function getXrefData($pdfData, $offset = 0, $xref = [])
         } elseif (strpos($pdfData, 'xref', $offset) == $offset) {
             // Already pointing at the xref table
             $startxref = $offset;
-        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, PREG_OFFSET_CAPTURE, $offset)) {
+        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
             // Cross-Reference Stream object
             $startxref = $offset;
         } elseif ($startxrefPreg) {

diff --git a/tests/AltAutoloadTest.php b/tests/AltAutoloadTest.php
@@ -0,0 +1,41 @@
+<?php
+
+/**
+ * @file This file is part of the PdfParser library.
+ *
+ * @author  Konrad Abicht <[email protected]>
+ * @date    2021-02-09
+ *
+ * @license LGPLv3
+ * @url     <https://github.com/smalot/pdfparser>
+ *
+ *  PdfParser is a pdf library written in PHP, extraction oriented.
+ *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.
+ *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
+ */
+require __DIR__.'/../alt_autoload.php-dist';
+// Smoke test: parse a sample PDF using only the classes made available by alt_autoload.php-dist.
+$parser = new Smalot\PdfParser\Parser();
+
+$filename = __DIR__.'/../samples/InternationalChars.pdf';
+$document = $parser->parseFile($filename);
+
+$needle = 'Лорем ипсум долор сит амет, еу сед либрис долорем инцоррупте.';
+if (false !== strpos($document->getText(), $needle)) { // strpos() returns false when the needle is missing
+    return 0;
+}
+// Reaching this point means the expected text was not extracted; the uncaught exception makes php exit non-zero.
+throw new Exception('Something went wrong. Alt-Autoload is not working.');