Skip to content

Commit

Permalink
Merge branch 'main' of github.com:hexydec/agentzero
Browse files (browse the repository at this point in the history)
  • Loading branch information
hexydec committed Jun 20, 2024
2 parents c98a0af + 2aaa7e8 commit b3892bd
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 19 deletions.
34 changes: 17 additions & 17 deletions src/mappings/crawlers.php
Original file line number | Diff line number | Diff line change
Expand Up @@ -37,7 +37,7 @@ public static function getApp(string $value, array $data = []) : array {
'duckduckgo-favicons-bot' => 'search',
'coccocbot-image' => 'search',
'coccocbot-web' => 'search',
'applebot' => 'search',
'applebot' => 'ai',
'yandexbot' => 'search',
'mj12bot' => 'search',
'mail.ru_bot' => 'search',
Expand All @@ -59,7 +59,13 @@ public static function getApp(string $value, array $data = []) : array {
'telegrambot' => 'feed',
'semrushbot' => 'crawler',
'mediatoolkitbot' => 'crawler',
'iploggerbot' => 'monitor'
'iploggerbot' => 'monitor',
'baiduspider' => 'search',
'haosouspider' => 'search',
'yisouspider' => 'search',
'360spider' => 'search',
'sogou web spider' => 'search',
'bytespider' => 'crawler'
];
$apps = [
'yacybot' => 'YacyBot',
Expand Down Expand Up @@ -94,6 +100,7 @@ public static function getApp(string $value, array $data = []) : array {
'mediatoolkitbot' => 'MediaToolkitBot',
'cfnetwork' => 'Apple Core Foundation Network',
'ncsc web check [email protected]' => 'NCSC Web Check',
'enhanced webcheck [email protected]' => 'NCSC Enhanced Web Check',
'the national archives uk government web archive:' => 'UK Government National Archives',
'google-site-verification' => 'Google Site Verification',
'google-inspectiontool' => 'Google Inspection Tool',
Expand All @@ -115,17 +122,20 @@ public static function getApp(string $value, array $data = []) : array {
'citoid' => 'Wikimedia Citoid',
'censysinspect' => 'Censys Inspect',
'googledocs' => 'Google Docs',
'user-agent: seolyt' => 'SEOlyt'
'user-agent: seolyt' => 'SEOlyt',
'bytespider' => 'ByteDance Spider',
'[email protected]' => 'ByteDance Spider'
];

$lower = \mb_strtolower($parts[0]);
return \array_merge([
'type' => 'robot',
'category' => $category[$lower] ?? (\mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper'),
'app' => $apps[$lower] ?? $parts[0],
'appname' => $parts[0],
'appversion' => empty($parts[1]) ? null : $parts[1]
], $data);
], $data, [
'category' => $category[$lower] ?? $data['category'] ?? (\mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper')
]);
}
return [];
}
Expand All @@ -150,18 +160,7 @@ public static function get() : array {
]
)),
'crawler' => function (string $value) : array {
$parts = \explode('/', $value, 2);
$map = [
'baiduspider' => 'search',
'haosouspider' => 'search',
'yisouspider' => 'search',
'360spider' => 'search',
'sogou web spider' => 'search',
'bytespider' => 'search',
];
return self::getApp($value, [
'category' => $map[\mb_strtolower($parts[0])] ?? 'crawler'
]);
return self::getApp($value, ['category' => 'crawler']);
},
'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),
Expand Down Expand Up @@ -253,6 +252,7 @@ public static function get() : array {
'Uptime/' => new props('start', $fn['monitor']),
'HostTracker/' => new props('start', $fn['monitor']),
'NCSC Web Check [email protected]' => new props('exact', $fn['monitor']),
'Enhanced WebCheck [email protected]' => new props('exact', $fn['monitor']),
'Pingdom.com' => new props('start', function (string $value) : array {
$version = \explode('_', \trim($value, '_'));
return [
Expand Down
4 changes: 2 additions & 2 deletions tests/crawlersTest.php
Original file line number | Diff line number | Diff line change
Expand Up @@ -317,8 +317,8 @@ public function testSearch() : void {
'Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; https://zhanzhang.toutiao.com/)' => [
'string' => 'Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; https://zhanzhang.toutiao.com/)',
'type' => 'robot',
'category' => 'search',
'app' => 'Bytespider',
'category' => 'crawler',
'app' => 'ByteDance Spider',
'appname' => 'Bytespider',
'url' => 'https://zhanzhang.toutiao.com/',
'platform' => 'Android',
Expand Down

0 comments on commit b3892bd

Please sign in to comment.