
Commit

make it quicker; allow passing skip folders that should not be crawled
FTAndy committed Dec 19, 2023
1 parent 71078f5 commit 6f157a3
Showing 6 changed files with 163 additions and 83 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -7,6 +7,11 @@
- Analyze and learn the architecture.
- ...more

Here are some GPTs created to analyze code:

- [react-virtuoso](https://chat.openai.com/g/g-WOhTJGfKu-react-virtuoso-analyzer) to analyze https://github.com/petyosi/react-virtuoso
- [gpt-crawler](https://chat.openai.com/g/g-Cnxt38AbK-gpt-crawler-repo-analyzer) to analyze https://github.com/BuilderIO/gpt-crawler

![demo](https://github.com/FTAndy/gpt-crawler/blob/main/demo.png?raw=true)

# Github repo GPT Crawler <!-- omit from toc -->
30 changes: 26 additions & 4 deletions src/cli.ts
@@ -9,14 +9,18 @@ import inquirer from "inquirer";
const messages = {
url: "What is the URL of the github project you want to crawl?",
branch: "What is the branch? (default to master)",
skipFolders: "What is the folder you want to skip? (use ',' to split)",
};

async function handler(options: GithubConfig) {
try {
let {
githubRepoUrl,
branch,
skipFolders
} = options as GithubConfig & { skipFolders?: string[] | string };

console.log(skipFolders, 'skipFolders')

const questions = [];

@@ -36,16 +40,33 @@ async function handler(options: GithubConfig) {
});
}

if (!skipFolders) {
questions.push({
type: "input",
name: "skipFolders",
message: messages.skipFolders,
});
}

const answers = await inquirer.prompt(questions);



githubRepoUrl = githubRepoUrl || answers.url;
branch = branch || answers.branch || 'master';
skipFolders = skipFolders || answers.skipFolders || '';
// Guard the empty string: ''.split(',') would yield [''], which later builds a bogus !() glob
skipFolders = typeof skipFolders === 'string' && skipFolders.length > 0 ? skipFolders.split(',').map(s => s.trim()).filter(Boolean) : []

console.log({
githubRepoUrl,
branch,
skipFolders
})

crawlerGithubForGPT({
githubRepoUrl,
branch,
skipFolders
})

} catch (error) {
@@ -58,6 +79,7 @@ program.version(packageJSON.version)
program
.option("-u, --githubRepoUrl <string>", messages.url, "")
.option("-m, --branch <string>", messages.branch, "")
.option("-m, --skipFolders <string>", messages.skipFolders, "")
.action(handler);

program.parse();
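
A minimal sketch of the normalization the updated handler applies to --skipFolders; the sample input string here is hypothetical:

// Hypothetical comma-separated value, as a user might enter at the prompt or via --skipFolders.
const raw = "blog, docs, e2e";
// The same normalization handler() applies: split on ',' and trim whitespace.
const normalized = raw.split(",").map((s) => s.trim());
console.log(normalized); // ["blog", "docs", "e2e"]
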
104 changes: 63 additions & 41 deletions src/config.ts
@@ -1,9 +1,39 @@
import { z } from "zod";

import type { Page } from "playwright";

const Page: z.ZodType<Page> = z.any();

/**
* Pattern to match against for links on a page to subsequently crawl
* @example "https://www.builder.io/c/docs/**"
* @default ""
*/
export const OriginMatch = z.string().or(z.array(z.string()));

export const PatternMatch = z.array(
z.object({
/**
* Pattern to match against for links on a page to subsequently crawl
* @example "https://www.builder.io/c/docs/**"
* @see https://github.com/isaacs/minimatch
* @default ""
*/
pattern: z.string(),
/**
* Selector to grab the inner text from, scoped to this pattern
* @example ".docs-builder-container"
* @default "body"
*/
selector: z.string().optional(),
/**
* Skip grabbing the inner text for this pattern
* @default false
*/
skip: z.boolean().optional(),
}),
);

export const configSchema = z.object({
/**
* URL to start the crawl; if the URL is a sitemap, all pages in the sitemap will be crawled
@@ -17,28 +17,47 @@
* @example "https://www.builder.io/c/docs/**"
* @default ""
*/
match: OriginMatch.or(PatternMatch),
/**
* Selector to grab the inner text from
* @example ".docs-builder-container"
@@ -56,29 +65,42 @@
*/
outputFileName: z.string(),
/** Optional cookie to be set. E.g. for Cookie Consent */
cookie: z
.object({
name: z.string(),
value: z.string(),
})
.optional(),
/** Optional function to run for each page found */
onVisitPage: z
.function()
.args(
z.object({
page: Page,
pushData: z.function().args(z.any()).returns(z.promise(z.void())),
}),
)
.returns(z.promise(z.void()))
.optional(),
/** Optional timeout for waiting for a selector to appear */
waitForSelectorTimeout: z.number().int().nonnegative().optional(),
/** Optional resources to exclude
*
* @example
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
*/
resourceExclusions: z.array(z.string()).optional(),

/** Optional maximum file size in megabytes to include in the output file
* @example 1
*/
maxFileSize: z.number().int().positive().optional(),
/** Optional maximum number of tokens to include in the output file
* @example 5000
*/
maxTokens: z.number().int().positive().optional(),
});

export type Config = z.infer<typeof configSchema>;

export type PatternMatchType = z.infer<typeof PatternMatch>;
export type OriginMatchType = z.infer<typeof OriginMatch>;
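
For context, a sketch of the two shapes `match` accepts after this change: a plain origin glob used together with the top-level selector, or an array of per-pattern entries. The URLs, selectors, and other fields below are hypothetical placeholders (fields from the collapsed parts of the schema are abbreviated):

import { configSchema } from "./config.js";

// Shape 1 — OriginMatch: a single glob string (an array of strings also parses);
// the top-level selector then applies to every matched page.
configSchema.parse({
  url: "https://example.com/docs",
  match: "https://example.com/docs/**",
  selector: "body",
  maxPagesToCrawl: 10,
  outputFileName: "output.json",
});

// Shape 2 — PatternMatch: per-pattern selectors, with an optional skip flag.
configSchema.parse({
  url: "https://example.com/docs",
  match: [
    { pattern: "https://example.com/docs/**/*.md", selector: ".markdown-body" },
    { pattern: "https://example.com/assets/**", skip: true },
  ],
  maxPagesToCrawl: 10,
  outputFileName: "output.json",
});
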
57 changes: 41 additions & 16 deletions src/core.ts
@@ -3,7 +3,7 @@ import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { minimatch } from "minimatch";
import { Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType } from "./config.js";
import { Page } from "playwright";

let pageCounter = 0;
@@ -72,29 +72,55 @@ export async function crawl(config: Config) {
);
pageCounter++;

let globs: string | string[] = [];

if (PatternMatch.safeParse(config.match).success) {
const matchPattern = config.match as PatternMatchType;
globs = matchPattern.filter(s => !s.skip).map((s) => s.pattern);
const matchedPattern = matchPattern.find((match) => {
return minimatch(request.url, match.pattern);
});
if (matchedPattern && !matchedPattern.skip) {
const selector = matchedPattern?.selector || "body";
// Use custom handling for XPath selector
if (selector.startsWith("/")) {
await waitForXPath(
page,
selector,
config.waitForSelectorTimeout ?? 1000,
);
} else {
await page.waitForSelector(selector, {
timeout: config.waitForSelectorTimeout ?? 1000,
});
}
const html = await getPageHtml(page, selector);

// Save results as JSON to ./storage/datasets/default
await pushData({ title, url: request.loadedUrl, html });
}
} else if (
OriginMatch.safeParse(config.match).success &&
config.selector
) {
const match = config.match as OriginMatchType;
globs = typeof match === "string" ? [match] : match;
// Use custom handling for XPath selector
if (config.selector.startsWith("/")) {
await waitForXPath(
page,
config.selector,
config.waitForSelectorTimeout ?? 1000,
);
} else {
await page.waitForSelector(config.selector, {
timeout: config.waitForSelectorTimeout ?? 1000,
});
}
const html = await getPageHtml(page, config.selector);

// Save results as JSON to ./storage/datasets/default
await pushData({ title, url: request.loadedUrl, html });

}

if (config.onVisitPage) {
@@ -104,8 +130,7 @@
// Extract links from the current page
// and add them to the crawling queue.
await enqueueLinks({
globs,
});
},
// Comment this option to scrape the full website.
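
A sketch of how the reworked requestHandler resolves a selector under a PatternMatch config: the first entry whose glob matches the request URL wins, with "body" as the fallback. The repo URLs below are hypothetical:

import { minimatch } from "minimatch";

const match = [
  { pattern: "https://github.com/owner/repo/blob/main/**/*.md", selector: ".markdown-body" },
  { pattern: "https://github.com/owner/repo/blob/main/**", selector: "#read-only-cursor-text-area" },
];

const url = "https://github.com/owner/repo/blob/main/README.md";
const matched = match.find((m) => minimatch(url, m.pattern));
console.log(matched?.selector || "body"); // ".markdown-body"
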
45 changes: 25 additions & 20 deletions src/index.ts
@@ -7,39 +7,44 @@ const githubConfigSchema = zod.object({
githubRepoUrl: zod.string(),
tag: zod.string().optional(),
branch: zod.string().optional(),
skipFolders: zod.array(zod.string()).optional(),
})

export type GithubConfig = zod.infer<typeof githubConfigSchema>;


export async function crawlerGithubForGPT(config: GithubConfig) {
const { githubRepoUrl, tag, branch, skipFolders } = githubConfigSchema.parse(config);

const path = tag || branch || 'master'

const treeEndPointUrl = `${githubRepoUrl}/tree/${path}`
const blobEndPointUrl = `${githubRepoUrl}/blob/${path}`

const match = [
{
// special case for .md files
// pattern: 'https://github.com/BuilderIO/gpt-crawler/blob/main/**/*.md',
pattern: `${blobEndPointUrl}/**/*.md`,
selector: '.markdown-body'
},
{
// other files like .js, .ts, .json, etc
pattern: `${blobEndPointUrl}/**`,
selector: '#read-only-cursor-text-area'
},
{
// folder pages: an extglob negation excludes the folders to skip
// pattern: "https://github.com/BuilderIO/gpt-crawler/tree/main/**",
pattern: `${treeEndPointUrl}/${skipFolders?.length ? `!(${skipFolders.join('|')})` : '**'}`,
},
]

console.log(match)

const innerConfig: Config = {
url: treeEndPointUrl,
match,
// selector: '#read-only-cursor-text-area',
maxPagesToCrawl: 500,
waitForSelectorTimeout: 1000 * 5,
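
The skip-folder pattern built above relies on minimatch's extglob support: `!(a|b)` matches a single path segment that is neither a nor b. A sketch with a hypothetical repo URL:

import { minimatch } from "minimatch";

const pattern = "https://github.com/owner/repo/tree/master/!(blog|docs)";

console.log(minimatch("https://github.com/owner/repo/tree/master/src", pattern));  // true
console.log(minimatch("https://github.com/owner/repo/tree/master/docs", pattern)); // false

Since the negation group covers exactly one path segment, only top-level folder pages match when skip folders are supplied; without them, the `**` fallback matches folder pages at any depth.
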
5 changes: 3 additions & 2 deletions src/main.ts
@@ -1,6 +1,7 @@
import { crawlerGithubForGPT } from "./index.js";

crawlerGithubForGPT({
githubRepoUrl: 'https://github.com/petyosi/react-virtuoso',
branch: 'master',
skipFolders: ['blog', 'docs', 'e2e', 'examples', 'site', 'docusaurus']
})
