refactor: refactor core functionality for improved readability
- Modify the `README.md` file:
  - Change the `match` property type to accept an array of strings.
- Modify the `src/config.ts` file:
  - Change the `OriginMatch` property type to accept an array of strings.
  - Change the `PatternMatch` property type to accept an array of objects.
- Modify the `src/core.ts` file:
  - Add import statements for `minimatch`, `Config`, `PatternMatch`, and `OriginMatch`.
  - Modify the `crawl` function:
    - Change the `globs` variable declaration to include a semicolon at the end.
    - Change the condition for checking `matchedPattern` to use the optional chaining operator.
    - Add a missing trailing comma to the `waitForXPath` call's timeout argument.
    - Split the multi-clause `else if` condition across separate lines for better readability.

Signed-off-by: FTAndy <[email protected]>
FTAndy committed Dec 8, 2023
1 parent f620176 commit 9d184b9
Showing 4 changed files with 55 additions and 40 deletions.
13 changes: 8 additions & 5 deletions README.md
```diff
@@ -71,11 +71,14 @@ type Config = {
   /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
-  match: string | string[] | {
-    pattern: string; // url glob expressions from https://github.com/isaacs/minimatch
-    selector?: string | undefined; // Selector to grab the inner text from
-    skip?: boolean | undefined; // Whether to skip grabbing any content from this pattern
-  }[];
+  match:
+    | string
+    | string[]
+    | {
+        pattern: string; // url glob expressions from https://github.com/isaacs/minimatch
+        selector?: string | undefined; // Selector to grab the inner text from
+        skip?: boolean | undefined; // Whether to skip grabbing any content from this pattern
+      }[];
   /** Selector to grab the inner text from */
   selector: string;
   /** Don't crawl more than this many pages */
```
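
For context, here is a hypothetical crawl config exercising each shape of the new `match` union. The URLs and selectors are placeholders rather than values from this commit, and whether the top-level `selector` stays required when pattern objects carry their own is not shown in this hunk:

```ts
// Placeholder values throughout; shapes follow the README type above.

// String-array form: URL globs plus one global selector.
const stringMatchConfig = {
  url: "https://www.builder.io/c/docs/developers",
  match: ["https://www.builder.io/c/docs/**"],
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};

// Pattern-object form: per-pattern selectors, with `skip` to follow
// links on matching pages without extracting their content.
const patternMatchConfig = {
  url: "https://www.builder.io/c/docs/developers",
  match: [
    {
      pattern: "https://www.builder.io/c/docs/**",
      selector: ".docs-builder-container",
    },
    { pattern: "https://www.builder.io/blog/**", skip: true },
  ],
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};
```
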
2 changes: 1 addition & 1 deletion config.ts
```diff
@@ -5,4 +5,4 @@ export const defaultConfig: Config = {
   match: "https://www.builder.io/c/docs/**",
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
-};
\ No newline at end of file
+};
```
44 changes: 23 additions & 21 deletions src/config.ts
```diff
@@ -9,28 +9,30 @@ const Page: z.ZodType<Page> = z.any();
  * @example "https://www.builder.io/c/docs/**"
  * @default ""
  */
-export const OriginMatch = z.string().or(z.array(z.string()))
+export const OriginMatch = z.string().or(z.array(z.string()));
 
-export const PatternMatch = z.array(z.object({
-  /**
-   * Pattern to match against for links on a page to subsequently crawl
-   * @example "https://www.builder.io/c/docs/**"
-   * @refer https://github.com/isaacs/minimatch
-   * @default ""
-   */
-  pattern: z.string(),
-  /**
-   * Selector to grab the inner text from, limited to this pattern
-   * @example ".docs-builder-container"
-   * @default "body"
-   */
-  selector: z.string().optional(),
-  /**
-   * Skip grabbing the inner text for this pattern
-   * @default false
-   */
-  skip: z.boolean().optional()
-}))
+export const PatternMatch = z.array(
+  z.object({
+    /**
+     * Pattern to match against for links on a page to subsequently crawl
+     * @example "https://www.builder.io/c/docs/**"
+     * @refer https://github.com/isaacs/minimatch
+     * @default ""
+     */
+    pattern: z.string(),
+    /**
+     * Selector to grab the inner text from, limited to this pattern
+     * @example ".docs-builder-container"
+     * @default "body"
+     */
+    selector: z.string().optional(),
+    /**
+     * Skip grabbing the inner text for this pattern
+     * @default false
+     */
+    skip: z.boolean().optional(),
+  }),
+);
 
 export const configSchema = z.object({
   /**
```
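
A minimal sketch of how these two schemas can discriminate a `match` value at runtime, checked in the same order `src/core.ts` uses below. The `describeMatch` helper and its sample inputs are hypothetical, and the import path assumes the snippet sits next to `config.ts`:

```ts
import { OriginMatch, PatternMatch } from "./config.js";

// Hypothetical helper: classify a user-supplied `match` value.
// PatternMatch is tried first, mirroring the order in src/core.ts.
function describeMatch(match: unknown): string {
  if (PatternMatch.safeParse(match).success) {
    return "pattern match: array of { pattern, selector?, skip? } objects";
  }
  if (OriginMatch.safeParse(match).success) {
    return "origin match: a glob string or an array of glob strings";
  }
  return "invalid match value";
}

console.log(describeMatch("https://www.builder.io/c/docs/**"));
// -> origin match: a glob string or an array of glob strings
console.log(describeMatch([{ pattern: "https://example.com/**", skip: true }]));
// -> pattern match: array of { pattern, selector?, skip? } objects
```
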
36 changes: 23 additions & 13 deletions src/core.ts
```diff
@@ -2,8 +2,15 @@
 import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import { minimatch } from 'minimatch'
-import { Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType } from "./config.js";
+import { minimatch } from "minimatch";
+import {
+  Config,
+  configSchema,
+  PatternMatch,
+  PatternMatchType,
+  OriginMatch,
+  OriginMatchType,
+} from "./config.js";
 import { Page } from "playwright";
 import { isWithinTokenLimit } from "gpt-tokenizer";
 
@@ -72,36 +72,39 @@ export async function crawl(config: Config) {
         `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
       );
 
-      let globs: string | string[] = []
+      let globs: string | string[] = [];
 
       if (PatternMatch.safeParse(config.match).success) {
-        const matchPattern = config.match as PatternMatchType
-        globs = matchPattern.map(s => s.pattern)
+        const matchPattern = config.match as PatternMatchType;
+        globs = matchPattern.map((s) => s.pattern);
         const matchedPattern = matchPattern.find((match) => {
           return minimatch(request.url, match.pattern);
-        })
+        });
         if (matchedPattern && !matchedPattern.skip) {
-          const selector = matchedPattern?.selector || 'body';
+          const selector = matchedPattern?.selector || "body";
           // Use custom handling for XPath selector
           if (selector.startsWith("/")) {
             await waitForXPath(
               page,
               selector,
-              config.waitForSelectorTimeout ?? 1000
+              config.waitForSelectorTimeout ?? 1000,
             );
           } else {
             await page.waitForSelector(selector, {
               timeout: config.waitForSelectorTimeout ?? 1000,
             });
           }
           const html = await getPageHtml(page, selector);
 
           // Save results as JSON to ./storage/datasets/default
           await pushData({ title, url: request.loadedUrl, html });
         }
-      } else if (OriginMatch.safeParse(config.match).success && config.selector) {
-        const match = config.match as OriginMatchType
-        globs = typeof match === "string" ? [match] : match
+      } else if (
+        OriginMatch.safeParse(config.match).success &&
+        config.selector
+      ) {
+        const match = config.match as OriginMatchType;
+        globs = typeof match === "string" ? [match] : match;
         // Use custom handling for XPath selector
         if (config.selector.startsWith("/")) {
           await waitForXPath(
@@ -127,7 +137,7 @@
       // Extract links from the current page
      // and add them to the crawling queue.
       await enqueueLinks({
-        globs
+        globs,
       });
     },
     // Comment this option to scrape the full website.
```
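
To make the new routing concrete, here is a standalone sketch of the pattern branch with hypothetical patterns and URL. `minimatch(url, pattern)` reports whether the URL satisfies a glob, and `globs` is what the handler ultimately passes to `enqueueLinks`:

```ts
import { minimatch } from "minimatch";

// Hypothetical patterns and URL for illustration.
const matchPattern: { pattern: string; selector?: string; skip?: boolean }[] = [
  {
    pattern: "https://www.builder.io/c/docs/**",
    selector: ".docs-builder-container",
  },
  { pattern: "https://www.builder.io/blog/**", skip: true },
];
const url = "https://www.builder.io/c/docs/getting-started";

// Every pattern contributes a glob for link discovery, matched or not.
const globs = matchPattern.map((s) => s.pattern);

// The first pattern whose glob matches the current URL decides extraction.
const matched = matchPattern.find((m) => minimatch(url, m.pattern));

if (matched && !matched.skip) {
  // Fall back to "body" when the matched pattern has no selector,
  // as the handler above does.
  console.log(`extract with selector: ${matched.selector || "body"}`);
} else {
  console.log("no extraction for this page; links are still enqueued");
}
console.log("globs for enqueueLinks:", globs);
```
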
