Skip to content

Commit

Permalink
Added sitemap.xml fallback if not found in robots.txt.
Browse files Browse the repository at this point in the history
  • Loading branch information
ivopetkov committed Jun 10, 2021
1 parent f596bbb commit 36f884f
Showing 1 changed file with 30 additions and 22 deletions.
52 changes: 30 additions & 22 deletions classes/Audits/Internal/Utilities.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,32 @@ static function initializeAudit(string $id): void
}

$urls = [];

/**
 * Downloads a sitemap and appends every <loc> value found inside its <url>
 * elements to $urls.
 *
 * On failure a human readable message is stored in $data['e'].
 *
 * @param string $sitemapURL The URL of the sitemap to download and parse.
 * @return bool TRUE if the sitemap was downloaded and parsed, FALSE otherwise.
 */
$processSitemapURL = function ($sitemapURL) use (&$data, &$urls) {
    $result = self::makeRequest($sitemapURL);
    if ($result['status'] !== 200) {
        $data['e'] = 'There is a problem with ' . $sitemapURL . ' (status:' . $result['status'] . ')';
        return false;
    }

    $content = $result['content'];

    // DOMDocument::loadXML() reports malformed XML with a warning and a FALSE
    // return value instead of an exception, so parse errors are collected
    // internally and the return value is checked explicitly.
    $previousErrorsMode = libxml_use_internal_errors(true);
    try {
        $dom = new DOMDocument();
        // An empty source must be rejected up front: on PHP 8 loadXML('')
        // throws a ValueError that the \Exception handler below cannot catch.
        if (!is_string($content) || $content === '' || $dom->loadXML($content) === false) {
            $data['e'] = 'Error finding URLs in ' . $sitemapURL;
            return false;
        }
        $elements = $dom->getElementsByTagName('url');
        foreach ($elements as $element) {
            $locationElements = $element->getElementsByTagName('loc');
            // Only entries with exactly one <loc> are accepted.
            if ($locationElements->length === 1) {
                $urls[] = $locationElements->item(0)->nodeValue;
            }
        }
        return true;
    } catch (\Exception $e) {
        $data['e'] = 'Error finding URLs in ' . $sitemapURL;
        return false;
    } finally {
        // Restore the libxml error mode that was active before this call and
        // drop the errors buffered here unless the caller was collecting them.
        if (!$previousErrorsMode) {
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previousErrorsMode);
    }
};

$maxPagesCount = $data['m'];

$robotsURL = $data['u'] . 'robots.txt';
$result = self::makeRequest($robotsURL);
if ($result['status'] === 200) {
Expand All @@ -49,34 +75,16 @@ static function initializeAudit(string $id): void
if (strlen($robotsLine) === 0) {
continue;
}
if (strpos($robotsLine, 'disallow:') === 0) {
$data['a'] = (int) ($robotsLine === 'disallow:');
if (str_replace(' ', '', $robotsLine) === 'disallow:/') {
$data['a'] = false;
} elseif (strpos($robotsLine, 'sitemap:') === 0) {
$sitemapURL = trim(substr($robotsLine, 8));
}
}
if (strlen($sitemapURL) > 0) {
$result = self::makeRequest($sitemapURL);
if ($result['status'] === 200) {
$maxPagesCount = $data['m'];
$dom = new DOMDocument();
try {
$dom->loadXML($result['content']);
$elements = $dom->getElementsByTagName('url');
foreach ($elements as $element) {
$locationElements = $element->getElementsByTagName('loc');
if ($locationElements->length === 1) {
$urls[] = $locationElements->item(0)->nodeValue;
}
}
} catch (\Exception $e) {
$data['e'] = 'Error finding URLs in ' . $sitemapURL;
}
} else {
$data['e'] = 'There is a problem with ' . $sitemapURL . ' (status:' . $result['status'] . ')';
}
$processSitemapURL($sitemapURL);
} else {
$data['e'] = 'Cannot find sitemap URL in ' . $robotsURL;
$processSitemapURL($data['u'] . 'sitemap.xml');
}
} else {
$data['e'] = 'There is a problem with ' . $robotsURL . ' (status:' . $result['status'] . ')';
Expand Down

0 comments on commit 36f884f

Please sign in to comment.