Skip to content

Commit

Permalink
Fixed stray tests and added a few more
Browse files Browse the repository at this point in the history
  • Loading branch information
eob committed Oct 1, 2024
1 parent 6c57dde commit d6f8263
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 87 deletions.
13 changes: 12 additions & 1 deletion fixtures/crawlers.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
Google Notebook LLM:
- Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)GoogleOther
Google Extended:
- Google-Extended
OpenAI SearchBot:
- Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot
OpenAI ChatGPT User:
- Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot
OpenAI GPTBot:
- Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot
- Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot
ClaudeBot:
- Claude-Web/1.0 (web crawler; +https://www.anthropic.com/; [email protected])
- ClaudeBot
- anthropic-ai
- Claude-Web
PerplexityBot:
- PerplexityBot
Cohere:
- cohere-ai
20 changes: 4 additions & 16 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,41 +1,29 @@
import patternsList from "./patterns.json";
import { fullPattern } from "./pattern";

/**
* Naive bot pattern.
*/
const naivePattern = /bot|crawl|http|lighthouse|scan|search|spider/i;

let pattern: RegExp;
/**
 * Return the compiled, case-insensitive detection pattern.
 *
 * The expression is compiled from `fullPattern` on first use and stored in the
 * module-level `pattern` binding, so subsequent calls return the same RegExp
 * instance instead of recompiling it.
 *
 * @returns The compiled RegExp, or `null` when constructing it throws.
 */
export function getPattern(): RegExp | null {
  if (pattern instanceof RegExp) {
    return pattern;
  }
  try {
    // Build this RegExp dynamically to avoid syntax errors in older engines.
    pattern = new RegExp(fullPattern, "i");
  } catch {
    // The engine rejected the pattern source: report "no pattern" to the
    // caller and leave the cache empty.
    return null;
  }
  return pattern;
}

/**
 * The `pattern` string of every record in the imported patterns list, in the
 * same order. Intended for use in a regular expression that is matched against
 * user agent strings.
 */
export const list: string[] = patternsList.map((entry) => entry.pattern);

/**
* Check if the given user agent includes a bot pattern. Naive implementation (less accurate).
*/
export const isaiNaive = (userAgent?: string | null): boolean =>
Boolean(userAgent) && naivePattern.test(userAgent);

/**
 * Check if the given user agent includes a bot pattern.
 *
 * @param userAgent - User agent string to inspect; may be empty, `null` or omitted.
 * @returns `true` when the pattern from `getPattern()` matches the user agent;
 *   `false` when the user agent is missing/empty or no pattern is available.
 */
export function isai(userAgent?: string | null): boolean {
  if (!userAgent) {
    return false;
  }
  // getPattern() may return null, so fall back to "not a match" rather than
  // dereferencing it.
  return getPattern()?.test(userAgent) ?? false;
}

/**
Expand Down
48 changes: 42 additions & 6 deletions src/patterns.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
]
},
{
"pattern": "https://openai.com/gptbot",
"pattern": "openai.*bot",
"confidence": 1.0,
"reasonsFor": [
"Published user agent substring by OpenAI"
Expand All @@ -19,21 +19,57 @@
]
},
{
"pattern": "https://openai.com/searchbot",
"pattern": "^ClaudeBot",
"confidence": 1.0,
"reasonsFor": [
"Published user agent substring by OpenAI"
"Reported Anthropic Claude user agent string"
],
"reasonsAgainst": [
]
},
{
"pattern": "https://openai.com/bot",
"pattern": "^anthropic",
"confidence": 1.0,
"reasonsFor": [
"Published user agent substring by OpenAI"
"Reported Anthropic Claude user agent string"
],
"reasonsAgainst": [
]
},
{
"pattern": "^Claude-Web",
"confidence": 1.0,
"reasonsFor": [
"Reported Anthropic Claude user agent string"
],
"reasonsAgainst": [
]
},
{
"pattern": "^PerplexityBot",
"confidence": 1.0,
"reasonsFor": [
"Reported Perplexity bot string"
],
"reasonsAgainst": [
]
},
{
"pattern": "^cohere-ai",
"confidence": 1.0,
"reasonsFor": [
"Reported Cohere bot string"
],
"reasonsAgainst": [
]
},
{
"pattern": "^Google-Extended",
"confidence": 1.0,
"reasonsFor": [
"Reported Google Bard bot string"
],
"reasonsAgainst": [
]
}
}
]
4 changes: 0 additions & 4 deletions tests/spec/__snapshots__/test.ts.snap
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ exports[`isai module interface interface is as expected 1`] = `
"list",
"Array",
],
[
"isaiNaive",
"Function",
],
[
"createisai",
"Function",
Expand Down
74 changes: 14 additions & 60 deletions tests/spec/test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import {
getPattern,
list,
isai,
isaiNaive,
isaiMatch,
isaiMatches,
isaiPattern,
Expand All @@ -19,16 +18,6 @@ const AI_USER_AGENT_EXAMPLE =
const BROWSER_USER_AGENT_EXAMPLE =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91 Safari/537.36";

const USER_AGENT_COMMON = [
"Ada Chat Bot/1.0 Request Block",
"Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4590.2 Safari/537.36 Chrome-Lighthouse",
];
const USER_AGENT_GOTCHAS = [
"Mozilla/5.0 (Linux; Android 10; CUBOT_X30) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.85 Mobile Safari/537.36",
"PS4Application libhttp/1.000 (PS4) CoreMedia libhttp/6.72 (PlayStation 4)",
];

describe("isai", () => {
describe("features", () => {
test("pattern: pattern is a regex", () => {
Expand All @@ -42,20 +31,20 @@ describe("isai", () => {
expect(isai(AI_USER_AGENT_EXAMPLE)).toBe(true);
});
test("isaiMatch: find pattern in bot user agent string", () => {
expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("https://openai.com/searchbot");
expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("openai.com/searchbot");
});
test("isaiMatches: find all patterns in bot user agent string", () => {
expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("https://openai.com/searchbot");
expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("openai.com/searchbot");
expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toHaveLength(1);
});
test("isaiPattern: find first pattern in bot user agent string", () => {
expect(isaiPattern(AI_USER_AGENT_EXAMPLE)).toBe(
"https://openai.com/searchbot",
"openai.*bot",
);
});
test("isaiPatterns: find all patterns in bot user agent string", () => {
expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toContain(
"https://openai.com/searchbot",
"openai.*bot",
);
expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toHaveLength(1);
});
Expand All @@ -64,19 +53,19 @@ describe("isai", () => {
expect(customisai(AI_USER_AGENT_EXAMPLE)).toBe(true);
});
test("createisaiFromList: create custom isai function with custom pattern", () => {
const ChromeLighthouseUserAgentStrings: string[] = [
"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot",
const ToRemoveStrings: string[] = [
"openai.*bot"
];
const patternsToRemove: Set<string> = new Set(
ChromeLighthouseUserAgentStrings.map(isaiMatches).flat(),
ToRemoveStrings.map(isaiMatches).flat(),
);
const isai2 = createisaiFromList(
list.filter(
(record: string): boolean => patternsToRemove.has(record) === false,
),
expect(patternsToRemove.size).toBeGreaterThan(0);
const list2 = list.filter(
(record: string): boolean => patternsToRemove.has(record) === false,
);
const [ua] = ChromeLighthouseUserAgentStrings;
expect(list2.length).toBeLessThan(list.length);
const isai2 = createisaiFromList(list2);
const ua = "https://openai.com/gptbot"
expect(isai(ua)).toBe(true);
expect(isai2(ua)).toBe(false);
});
Expand All @@ -92,27 +81,6 @@ describe("isai", () => {
);
});

describe("isaiNaive", () => {
test.each([75])(
"a large number of user agent strings can be detected (>%s%)",
(percent) => {
const ratio =
crawlers.filter((ua) => isaiNaive(ua)).length / crawlers.length;
expect(ratio).toBeLessThanOrEqual(1);
expect(ratio).toBeGreaterThan(percent / 100);
},
);
test.each([1])(
"a small number of browsers is falsly detected as bots (<%s%)",
(percent) => {
const ratio =
browsers.filter((ua) => isaiNaive(ua)).length / browsers.length;
expect(ratio).toBeGreaterThan(0);
expect(ratio).toBeLessThan(percent / 100);
},
);
});

describe("regex fallback", () => {
beforeAll(async () => {
jest
Expand All @@ -132,20 +100,6 @@ describe("isai", () => {
afterAll(() => {
jest.restoreAllMocks();
});
test("fallback regex detects commong crawlers", () => {
USER_AGENT_COMMON.forEach((ua) => {
if (!isaiInstance(ua)) {
throw new Error(`Failed to detect ${ua} as bot`);
}
});
});
test("fallback detects gotchas as bots", () => {
USER_AGENT_GOTCHAS.forEach((ua) => {
if (!isaiInstance(ua)) {
throw new Error(`Failed to detect ${ua} as bot (gotcha)`);
}
});
});
test("fallback does not detect browser as bot", () => {
expect(isaiInstance(BROWSER_USER_AGENT_EXAMPLE)).toBe(false);
});
Expand Down Expand Up @@ -189,7 +143,7 @@ describe("isai", () => {
});
test("regular expressions exports are as expected", () => {
expect(new RegExp(fullPattern, "i").toString()).toBe(
getPattern().toString(),
getPattern()?.toString(),
);
});
});
Expand Down

0 comments on commit d6f8263

Please sign in to comment.