diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b138d5c..21062cc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - php-versions: ['7.4', '8.0'] + php-versions: ['7.4', '8.0', '8.1', '8.2'] steps: - name: Checkout code diff --git a/.php-cs-fixer.dist.php b/.php-cs-fixer.dist.php index 99e8f71..1ca9e33 100644 --- a/.php-cs-fixer.dist.php +++ b/.php-cs-fixer.dist.php @@ -5,9 +5,11 @@ $config = new PhpCsFixer\Config(); -return $config->setRules([ +return $config->setFinder($finder) + ->setRules([ '@PSR12' => true, 'strict_param' => true, 'array_syntax' => ['syntax' => 'short'], ]) - ->setFinder($finder); + ->setRiskyAllowed(true) + ->setUsingCache(true); diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a6ac16..9f94305 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [1.0.0] - 2022-09-22 +### Changed +- Required PHP version is now 8.0. + +### Added +- It now also parses `Sitemap:` lines. You can get all referenced sitemaps via the `sitemaps()` method of the `RobotsTxt` class. + ## [0.1.2] - 2022-09-16 ### Fixed - Also allow usage of crwlr/url 1.0 as it's not a problem at all and the PHP version requirement of this package is still `^7.4|^8.0`. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3b1a66b..a829792 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -37,7 +37,7 @@ Linting can be executed using the `composer cs` command. When you're making changes to this package please always run unit tests, CS Fixer and PHPStan. Commands: `composer test` -`composer cs` +`composer cs` or `composer cs-fix` `composer stan` Ideally you add the pre-commit git hook that is shipped with @@ -45,7 +45,7 @@ this repo that will run tests and linting. Add it to your local clone by running: `composer add-git-hooks` -Also please don't forget to add new test cases if necessary. +Also, please don't forget to add new test cases if necessary. ### Documentation @@ -56,5 +56,5 @@ For any code change please don't forget to add an entry to the ## Appreciation When your pull request is merged I will show some love and tweet -about it. Also if you meet me in person I will be glad to buy you +about it. Also, if you meet me in person I will be glad to buy you a beer. diff --git a/LICENSE b/LICENSE index 00652ac..61fc7e1 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2021 Christian Olear +Copyright (c) 2022 Christian Olear Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/README.md b/README.md index 54ec1cd..84d4976 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +

crwlr.software logo

+ # Robots Exclusion Standard/Protocol Parser ## for Web Crawling/Scraping @@ -5,34 +7,9 @@ Use this library within crawler/scraper programs to parse robots.txt files and check if your crawler user-agent is allowed to load certain paths. -## Requirements - -Requires PHP version 7.4 or above. - -## Installation - -Install the latest version with: - -```sh -composer require crwlr/robots-txt -``` - -## Usage - -```php -use Crwlr\RobotsTxt\RobotsTxt; - -$robotsTxtContent = file_get_contents('https://www.crwlr.software/robots.txt'); -$robotsTxt = RobotsTxt::parse($robotsTxtContent); - -$robotsTxt->isAllowed('/packages', 'MyBotName'); -``` +## Documentation +You can find the documentation at [crwlr.software](https://www.crwlr.software/packages/robots-txt/getting-started). -You can also check with an absolute url. -But attention: the library won't (/can't) check if the host of your -absolute url is the same as the robots.txt file was on (because it -doesn't know the host where it's on, you just give it the content). +## Contributing -```php -$robotsTxt->isAllowed('https://www.crwlr.software/packages', 'MyBotName'); -``` +If you consider contributing something to this package, read the [contribution guide (CONTRIBUTING.md)](CONTRIBUTING.md). diff --git a/composer.json b/composer.json index 74f31b5..ed02e17 100644 --- a/composer.json +++ b/composer.json @@ -31,7 +31,7 @@ "docs": "https://www.crwlr.software/packages/robots-txt" }, "require": { - "php": "^7.4|^8.0", + "php": "^8.0", "crwlr/url": "^1.0|^2.0" }, "require-dev": { @@ -48,7 +48,8 @@ }, "scripts": { "test": "@php vendor/bin/phpunit", - "cs": "PHP_CS_FIXER_IGNORE_ENV=1 php vendor/bin/php-cs-fixer fix -v --diff --dry-run --allow-risky=yes", + "cs": "@php vendor/bin/php-cs-fixer fix -v --diff --dry-run", + "cs-fix": "@php vendor/bin/php-cs-fixer fix -v", "stan": "@php vendor/bin/phpstan analyse -c phpstan.neon", "add-git-hooks": "@php bin/add-git-hooks" } diff --git a/src/Parser.php b/src/Parser.php index 7f92b37..c93ff29 100644 --- a/src/Parser.php +++ b/src/Parser.php @@ -7,14 +7,12 @@ final class Parser { /** - * @param string $robotsTxtContent - * @return RobotsTxt * @throws InvalidRobotsTxtFileException */ public function parse(string $robotsTxtContent): RobotsTxt { $lines = explode("\n", $robotsTxtContent); - $userAgentGroups = []; + $userAgentGroups = $sitemaps = []; for ($lineNumber = 0; $lineNumber < count($lines); $lineNumber++) { $line = $this->getLine($lines, $lineNumber); @@ -31,15 +29,17 @@ public function parse(string $robotsTxtContent): RobotsTxt $this->addRuleToUserAgentGroup($line, $userAgentGroup); } + + if ($this->isSitemapLine($line)) { + $sitemaps[] = $this->getSitemapFromLine($line); + } } - return new RobotsTxt($userAgentGroups); + return new RobotsTxt($userAgentGroups, $sitemaps); } /** * @param string[] $lines - * @param int $lineNumber - * @return string */ private function getLine(array $lines, int $lineNumber): string { @@ -47,9 +47,7 @@ private function getLine(array $lines, int $lineNumber): string } /** - * @param array|string[] $lines - * @param int $lineNumber - * @return string|null + * @param string[] $lines */ private function getNextLine(array $lines, int $lineNumber): ?string { @@ -70,6 +68,11 @@ private function isRuleLine(string $line): bool return $this->isDisallowLine($line) || $this->isAllowLine($line); } + private function isSitemapLine(string $line): bool + { + return preg_match('/^\s?sitemap\s?:/i', $line) === 1; + } + private function isDisallowLine(string $line): bool { return preg_match('/^\s?disallow\s?:/i', $line) === 1; @@ -81,7 +84,7 @@ private function isAllowLine(string $line): bool } /** - * @param array|string[] $lines + * @param string[] $lines */ private function makeUserAgentGroup(array $lines, string $line, int &$lineNumber): UserAgentGroup { @@ -112,15 +115,16 @@ private function addRuleToUserAgentGroup(string $line, UserAgentGroup $userAgent } } - /** - * @param string $line - * @return string - */ private function getUserAgentFromLine(string $line): string { return $this->getStringAfterFirstColon($line); } + private function getSitemapFromLine(string $line): string + { + return $this->getStringAfterFirstColon($line); + } + private function getPatternFromRuleLine(string $line): string { $lineAfterFirstColon = $this->getStringAfterFirstColon($line); diff --git a/src/RobotsTxt.php b/src/RobotsTxt.php index 99e0e33..fd04993 100644 --- a/src/RobotsTxt.php +++ b/src/RobotsTxt.php @@ -3,19 +3,16 @@ namespace Crwlr\RobotsTxt; use Crwlr\RobotsTxt\Exceptions\InvalidRobotsTxtFileException; +use Exception; use InvalidArgumentException; final class RobotsTxt { /** - * @var array|UserAgentGroup[] + * @param UserAgentGroup[] $userAgentGroups + * @param string[] $sitemaps */ - private array $userAgentGroups = []; - - /** - * @param array|UserAgentGroup[] $userAgentGroups - */ - public function __construct(array $userAgentGroups) + public function __construct(private array $userAgentGroups, private array $sitemaps = []) { foreach ($userAgentGroups as $userAgentGroup) { if (!$userAgentGroup instanceof UserAgentGroup) { @@ -24,8 +21,6 @@ public function __construct(array $userAgentGroups) ); } } - - $this->userAgentGroups = $userAgentGroups; } /** @@ -37,13 +32,24 @@ public static function parse(string $robotsTxtContent): RobotsTxt } /** - * @return array|UserAgentGroup[] + * @return UserAgentGroup[] */ public function groups(): array { return $this->userAgentGroups; } + /** + * @return string[] + */ + public function sitemaps(): array + { + return $this->sitemaps; + } + + /** + * @throws Exception + */ public function isAllowed(string $uri, string $userAgent): bool { $matchingGroups = $this->getGroupsMatchingUserAgent($userAgent); @@ -61,7 +67,7 @@ public function isAllowed(string $uri, string $userAgent): bool /** * Find all groups that match a certain user agent string. * - * @return array|UserAgentGroup[] + * @return UserAgentGroup[] */ private function getGroupsMatchingUserAgent(string $userAgent): array { @@ -77,7 +83,7 @@ private function getGroupsMatchingUserAgent(string $userAgent): array } /** - * @param array|UserAgentGroup[] $groups + * @param UserAgentGroup[] $groups */ private function combineGroups(array $groups): UserAgentGroup { diff --git a/src/RulePattern.php b/src/RulePattern.php index bbebe7b..26e63a6 100644 --- a/src/RulePattern.php +++ b/src/RulePattern.php @@ -3,7 +3,7 @@ namespace Crwlr\RobotsTxt; use Crwlr\Url\Url; -use InvalidArgumentException; +use Exception; final class RulePattern { @@ -21,14 +21,10 @@ public function pattern(): string } /** - * @param string|Url|mixed $uri + * @throws Exception */ - public function matches($uri): bool + public function matches(string|Url $uri): bool { - if (!$uri instanceof Url && !is_string($uri)) { - throw new InvalidArgumentException('Argument $uri must be a string or instance of Crwlr\Url.'); - } - $path = $uri instanceof Url ? $uri->path() : Url::parse($uri)->path(); if (!is_string($path)) { diff --git a/src/UserAgentGroup.php b/src/UserAgentGroup.php index 6f327b4..768cf69 100644 --- a/src/UserAgentGroup.php +++ b/src/UserAgentGroup.php @@ -3,37 +3,31 @@ namespace Crwlr\RobotsTxt; use Crwlr\Url\Url; +use Exception; use InvalidArgumentException; final class UserAgentGroup { /** - * @var array|string[] - */ - private array $userAgents; - - /** - * @var array|RulePattern[] + * @var RulePattern[] */ private array $disallowedPatterns = []; /** - * @var array|RulePattern[] + * @var RulePattern[] */ private array $allowedPatterns = []; /** - * @param array|string[] $userAgents + * @param string[] $userAgents */ - public function __construct(array $userAgents) + public function __construct(private array $userAgents) { foreach ($userAgents as $userAgent) { if (!is_string($userAgent)) { throw new InvalidArgumentException('Argument $userAgents must exclusively contain user agent strings.'); } } - - $this->userAgents = $userAgents; } public function contains(string $userAgent): bool @@ -47,6 +41,9 @@ public function contains(string $userAgent): bool return false; } + /** + * @throws Exception + */ public function isAllowed(string $uri): bool { $uri = Url::parse($uri); @@ -66,7 +63,7 @@ public function isAllowed(string $uri): bool } /** - * @return array|string[] + * @return string[] */ public function userAgents(): array { @@ -74,7 +71,7 @@ public function userAgents(): array } /** - * @return array|RulePattern[] + * @return RulePattern[] */ public function disallowedPatterns(): array { @@ -82,7 +79,7 @@ public function disallowedPatterns(): array } /** - * @return array|RulePattern[] + * @return RulePattern[] */ public function allowedPatterns(): array { @@ -100,7 +97,8 @@ public function addAllowedPattern(RulePattern $pattern): void } /** - * @return array|RulePattern[] + * @return RulePattern[] + * @throws Exception */ private function getMatchingDisallowedPatterns(Url $url): array { @@ -108,7 +106,8 @@ private function getMatchingDisallowedPatterns(Url $url): array } /** - * @return array|RulePattern[] + * @return RulePattern[] + * @throws Exception */ private function getMatchingAllowedPatterns(Url $url): array { @@ -116,8 +115,9 @@ private function getMatchingAllowedPatterns(Url $url): array } /** - * @param array|RulePattern[] $patterns - * @return array|RulePattern[] + * @param RulePattern[] $patterns + * @return RulePattern[] + * @throws Exception */ private function getMatchingPatterns(Url $url, array $patterns): array { @@ -138,8 +138,8 @@ private function getMatchingPatterns(Url $url, array $patterns): array * "The most specific match found MUST be used. The most specific match is the match that has the most octets. * If an allow and disallow rule is equivalent, the allow SHOULD be used." * - * @param array|RulePattern[] $disallowedPatterns - * @param array|RulePattern[] $allowedPatterns + * @param RulePattern[] $disallowedPatterns + * @param RulePattern[] $allowedPatterns */ private function isAllowedByMostSpecificMatch(array $disallowedPatterns, array $allowedPatterns): bool { diff --git a/tests/ParserTest.php b/tests/ParserTest.php index 5933d71..e09dfa6 100644 --- a/tests/ParserTest.php +++ b/tests/ParserTest.php @@ -318,6 +318,33 @@ public function test_parse_mixed_disallow_and_allow_rules_to_multiple_user_agent $this->assertArrayOfPatterns(['/hidden-for-most-bots'], $group3->allowedPatterns()); } + public function test_parse_sitemap_lines(): void + { + $robotsTxtContent = <<parse($robotsTxtContent); + + $this->assertCount(3, $robotsTxt->sitemaps()); + + $this->assertEquals([ + 'https://www.example.com/sitemap1.xml', + 'https://www.example.com/sitemap2.xml', + 'https://www.example.org/sitemap3.xml', + ], $robotsTxt->sitemaps()); + } + /** * @param string[] $expected * @param RulePattern[] $actual diff --git a/tests/RulePatternTest.php b/tests/RulePatternTest.php index 227c8ad..88ea576 100644 --- a/tests/RulePatternTest.php +++ b/tests/RulePatternTest.php @@ -3,7 +3,6 @@ declare(strict_types=1); use Crwlr\RobotsTxt\RulePattern; -use Crwlr\Url\Url; use PHPUnit\Framework\TestCase; final class RulePatternTest extends TestCase @@ -14,15 +13,6 @@ public function test_it_returns_the_raw_pattern(): void $this->assertEquals('/fo%6F/*/baz$', $pattern->pattern()); } - public function test_matches_accepts_only_string_or_url_object_as_param(): void - { - $this->assertTrue((new RulePattern('/foo'))->matches('/foo')); - $this->assertTrue((new RulePattern('/foo'))->matches(Url::parse('/foo'))); - - $this->expectException(InvalidArgumentException::class); - (new RulePattern('/foo'))->matches(123); - } - public function test_match_an_exact_match(): void { $this->assertTrue((new RulePattern('/home'))->matches('/home'));