
Commit

make it quicker; allow passing skip folders that should not be crawled
FTAndy committed Dec 19, 2023
1 parent 71078f5 commit 6f157a3
Showing 6 changed files with 163 additions and 83 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -7,6 +7,11 @@
- Analyze and learn the architecture.
- ...more

Here are some GPTs created to analyze code:

- [react-virtuoso](https://chat.openai.com/g/g-WOhTJGfKu-react-virtuoso-analyzer) to analyze https://github.com/petyosi/react-virtuoso
- [gpt-crawler](https://chat.openai.com/g/g-Cnxt38AbK-gpt-crawler-repo-analyzer) to analyze https://github.com/BuilderIO/gpt-crawler

![demo](https://github.com/FTAndy/gpt-crawler/blob/main/demo.png?raw=true)

# Github repo GPT Crawler <!-- omit from toc -->
30 changes: 26 additions & 4 deletions src/cli.ts
@@ -9,14 +9,18 @@ import inquirer from "inquirer";
const messages = {
url: "What is the URL of the github project you want to crawl?",
branch: "What is the branch? (default to master)",
skipFolders: "What is the folder you want to skip? (use ',' to split)",
};

async function handler(options: GithubConfig) {
try {
let {
githubRepoUrl,
branch,
skipFolders
} = options as GithubConfig & { skipFolders?: string[] | string };

console.log(skipFolders, 'skipFolders')

const questions = [];

@@ -36,16 +40,33 @@ async function handler(options: GithubConfig) {
});
}

if (!skipFolders) {
questions.push({
type: "input",
name: "skipFolders",
message: messages.skipFolders,
});
}

const answers = await inquirer.prompt(questions);



githubRepoUrl = githubRepoUrl || answers.url;
branch = branch || answers.branch || 'master';
skipFolders = skipFolders || answers.skipFolders || '';
// Guard the empty string: ''.split(',') would yield [''], which later builds a bogus !() glob
skipFolders = typeof skipFolders === 'string' && skipFolders.length > 0 ? skipFolders.split(',').map(s => s.trim()).filter(Boolean) : []

console.log({
githubRepoUrl,
branch,
skipFolders
})

crawlerGithubForGPT({
githubRepoUrl,
branch,
skipFolders
})

} catch (error) {
@@ -58,6 +79,7 @@ program.version(packageJSON.version)
program
.option("-u, --githubRepoUrl <string>", messages.url, "")
.option("-m, --branch <string>", messages.branch, "")
.option("-m, --skipFolders <string>", messages.skipFolders, "")
.action(handler);

program.parse();
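
A minimal sketch of the normalization the updated handler applies to --skipFolders; the sample input string here is hypothetical:

// Hypothetical comma-separated value, as a user might enter at the prompt or via --skipFolders.
const raw = "blog, docs, e2e";
// The same normalization handler() applies: split on ',' and trim whitespace.
const normalized = raw.split(",").map((s) => s.trim());
console.log(normalized); // ["blog", "docs", "e2e"]
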
104 changes: 63 additions & 41 deletions src/config.ts
@@ -1,9 +1,39 @@
import { z } from "zod";

import type { Page } from "playwright";

const Page: z.ZodType<Page> = z.any();

/**
* Pattern to match against for links on a page to subsequently crawl
* @example "https://www.builder.io/c/docs/**"
* @default ""
*/
export const OriginMatch = z.string().or(z.array(z.string()));

export const PatternMatch = z.array(
z.object({
/**
* Pattern to match against for links on a page to subsequently crawl
* @example "https://www.builder.io/c/docs/**"
* @see https://github.com/isaacs/minimatch
* @default ""
*/
pattern: z.string(),
/**
* Selector to grab the inner text from, scoped to this pattern
* @example ".docs-builder-container"
* @default "body"
*/
selector: z.string().optional(),
/**
* Skip grabbing the inner text for this pattern
* @default false
*/
skip: z.boolean().optional(),
}),
);

export const configSchema = z.object({
/**
* URL to start the crawl; if the URL is a sitemap, all pages in the sitemap will be crawled
@@ -17,28 +17,47 @@
* @example "https://www.builder.io/c/docs/**"
* @default ""
*/
match: OriginMatch.or(PatternMatch),
/**
* Selector to grab the inner text from
* @example ".docs-builder-container"
@@ -56,29 +65,42 @@
*/
outputFileName: z.string(),
/** Optional cookie to be set. E.g. for Cookie Consent */
cookie: z
.object({
name: z.string(),
value: z.string(),
})
.optional(),
/** Optional function to run for each page found */
onVisitPage: z
.function()
.args(
z.object({
page: Page,
pushData: z.function().args(z.any()).returns(z.promise(z.void())),
}),
)
.returns(z.promise(z.void()))
.optional(),
/** Optional timeout for waiting for a selector to appear */
waitForSelectorTimeout: z.number().int().nonnegative().optional(),
/** Optional resources to exclude
*
* @example
* ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
*/
resourceExclusions: z.array(z.string()).optional(),

/** Optional maximum file size in megabytes to include in the output file
* @example 1
*/
maxFileSize: z.number().int().positive().optional(),
/** Optional maximum number of tokens to include in the output file
* @example 5000
*/
maxTokens: z.number().int().positive().optional(),
});

export type Config = z.infer<typeof configSchema>;

export type PatternMatchType = z.infer<typeof PatternMatch>;
export type OriginMatchType = z.infer<typeof OriginMatch>;
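
For context, a sketch of the two shapes `match` accepts after this change: a plain origin glob used together with the top-level selector, or an array of per-pattern entries. The URLs, selectors, and other fields below are hypothetical placeholders (fields from the collapsed parts of the schema are abbreviated):

import { configSchema } from "./config.js";

// Shape 1 — OriginMatch: a single glob string (an array of strings also parses);
// the top-level selector then applies to every matched page.
configSchema.parse({
  url: "https://example.com/docs",
  match: "https://example.com/docs/**",
  selector: "body",
  maxPagesToCrawl: 10,
  outputFileName: "output.json",
});

// Shape 2 — PatternMatch: per-pattern selectors, with an optional skip flag.
configSchema.parse({
  url: "https://example.com/docs",
  match: [
    { pattern: "https://example.com/docs/**/*.md", selector: ".markdown-body" },
    { pattern: "https://example.com/assets/**", skip: true },
  ],
  maxPagesToCrawl: 10,
  outputFileName: "output.json",
});
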
57 changes: 41 additions & 16 deletions src/core.ts
@@ -3,7 +3,7 @@ import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { minimatch } from "minimatch";
import { Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType } from "./config.js";
import { Page } from "playwright";

let pageCounter = 0;
@@ -72,29 +72,55 @@ export async function crawl(config: Config) {
);
pageCounter++;

let globs: string | string[] = [];

if (PatternMatch.safeParse(config.match).success) {
const matchPattern = config.match as PatternMatchType;
globs = matchPattern.filter(s => !s.skip).map((s) => s.pattern);
const matchedPattern = matchPattern.find((match) => {
return minimatch(request.url, match.pattern);
});
if (matchedPattern && !matchedPattern.skip) {
const selector = matchedPattern?.selector || "body";
// Use custom handling for XPath selector
if (selector.startsWith("/")) {
await waitForXPath(
page,
selector,
config.waitForSelectorTimeout ?? 1000,
);
} else {
await page.waitForSelector(selector, {
timeout: config.waitForSelectorTimeout ?? 1000,
});
}
const html = await getPageHtml(page, selector);

// Save results as JSON to ./storage/datasets/default
await pushData({ title, url: request.loadedUrl, html });
}
} else if (
OriginMatch.safeParse(config.match).success &&
config.selector
) {
const match = config.match as OriginMatchType;
globs = typeof match === "string" ? [match] : match;
// Use custom handling for XPath selector
if (config.selector.startsWith("/")) {
await waitForXPath(
page,
config.selector,
config.waitForSelectorTimeout ?? 1000,
);
} else {
await page.waitForSelector(config.selector, {
timeout: config.waitForSelectorTimeout ?? 1000,
});
}
const html = await getPageHtml(page, config.selector);

// Save results as JSON to ./storage/datasets/default
await pushData({ title, url: request.loadedUrl, html });

}

if (config.onVisitPage) {
@@ -104,8 +130,7 @@
// Extract links from the current page
// and add them to the crawling queue.
await enqueueLinks({
globs,
});
},
// Comment this option to scrape the full website.
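
A sketch of how the reworked requestHandler resolves a selector under a PatternMatch config: the first entry whose glob matches the request URL wins, with "body" as the fallback. The repo URLs below are hypothetical:

import { minimatch } from "minimatch";

const match = [
  { pattern: "https://github.com/owner/repo/blob/main/**/*.md", selector: ".markdown-body" },
  { pattern: "https://github.com/owner/repo/blob/main/**", selector: "#read-only-cursor-text-area" },
];

const url = "https://github.com/owner/repo/blob/main/README.md";
const matched = match.find((m) => minimatch(url, m.pattern));
console.log(matched?.selector || "body"); // ".markdown-body"
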
45 changes: 25 additions & 20 deletions src/index.ts
@@ -7,39 +7,44 @@ const githubConfigSchema = zod.object({
githubRepoUrl: zod.string(),
tag: zod.string().optional(),
branch: zod.string().optional(),
skipFolders: zod.array(zod.string()).optional(),
})

export type GithubConfig = zod.infer<typeof githubConfigSchema>;


export async function crawlerGithubForGPT(config: GithubConfig) {
const { githubRepoUrl, tag, branch, skipFolders } = githubConfigSchema.parse(config);

const path = tag || branch || 'master'

const treeEndPointUrl = `${githubRepoUrl}/tree/${path}`
const blobEndPointUrl = `${githubRepoUrl}/blob/${path}`

const match = [
{
// special case for .md files
// pattern: 'https://github.com/BuilderIO/gpt-crawler/blob/main/**/*.md',
pattern: `${blobEndPointUrl}/**/*.md`,
selector: '.markdown-body'
},
{
// other files like .js, .ts, .json, etc
pattern: `${blobEndPointUrl}/**`,
selector: '#read-only-cursor-text-area'
},
{
// folder pages: an extglob negation excludes the folders to skip
// pattern: "https://github.com/BuilderIO/gpt-crawler/tree/main/**",
pattern: `${treeEndPointUrl}/${skipFolders?.length ? `!(${skipFolders.join('|')})` : '**'}`,
},
]

console.log(match)

const innerConfig: Config = {
url: treeEndPointUrl,
match,
// selector: '#read-only-cursor-text-area',
maxPagesToCrawl: 500,
waitForSelectorTimeout: 1000 * 5,
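
The skip-folder pattern built above relies on minimatch's extglob support: `!(a|b)` matches a single path segment that is neither a nor b. A sketch with a hypothetical repo URL:

import { minimatch } from "minimatch";

const pattern = "https://github.com/owner/repo/tree/master/!(blog|docs)";

console.log(minimatch("https://github.com/owner/repo/tree/master/src", pattern));  // true
console.log(minimatch("https://github.com/owner/repo/tree/master/docs", pattern)); // false

Since the negation group covers exactly one path segment, only top-level folder pages match when skip folders are supplied; without them, the `**` fallback matches folder pages at any depth.
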
5 changes: 3 additions & 2 deletions src/main.ts
@@ -1,6 +1,7 @@
import { crawlerGithubForGPT } from "./index.js";

crawlerGithubForGPT({
githubRepoUrl: 'https://github.com/petyosi/react-virtuoso',
branch: 'master',
skipFolders: ['blog', 'docs', 'e2e', 'examples', 'site', 'docusaurus']
})
