From 9d184b9c7b1470abf7cea3e458335f219276c900 Mon Sep 17 00:00:00 2001
From: FTAndy
Date: Fri, 8 Dec 2023 16:39:10 +0800
Subject: [PATCH] refactor: reformat core files for improved readability

- `README.md`:
  - Reformat the `match` property's union type across multiple lines;
    the accepted shapes (a string, an array of strings, or an array of
    pattern objects) are unchanged.
  - Reword the `skip` comment for grammar.
- `config.ts`:
  - Add the missing newline at end of file.
- `src/config.ts`:
  - Terminate the `OriginMatch` and `PatternMatch` declarations with
    semicolons, and reformat `PatternMatch` so the `z.object({...})`
    literal is indented inside `z.array(...)` with trailing commas.
  - Fix the "grap" typo in the `skip` doc comment.
- `src/core.ts`:
  - Normalize import quotes and split the long `./config.js` import
    across multiple lines.
  - In the `crawl` function, add the missing semicolons (the `globs`
    declaration and the `matchPattern`, `globs`, and `match`
    assignments), add trailing commas to multi-line call arguments, and
    wrap the long `else if` condition across multiple lines.

Comment and formatting changes only; runtime behavior is unchanged.

Signed-off-by: FTAndy
---
 README.md     | 13 ++++++++-----
 config.ts     |  2 +-
 src/config.ts | 44 +++++++++++++++++++++++---------------------
 src/core.ts   | 36 +++++++++++++++++++++++------------
 4 files changed, 55 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index 55db7170..165bce8e 100644
--- a/README.md
+++ b/README.md
@@ -71,11 +71,14 @@ type Config = {
   /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
-  match: string | string[] | {
-    pattern: string; // url glob expressions from https://github.com/isaacs/minimatch
-    selector?: string | undefined; // Selector to grab the inner text from
-    skip?: boolean | undefined; // Whether skip to not grab any content from this pattern
-  }[];
+  match:
+    | string
+    | string[]
+    | {
+        pattern: string; // URL glob expressions from https://github.com/isaacs/minimatch
+        selector?: string | undefined; // Selector to grab the inner text from
+        skip?: boolean | undefined; // Whether to skip grabbing any content from this pattern
+      }[];
   /** Selector to grab the inner text from */
   selector: string;
   /** Don't crawl more than this many pages */
diff --git a/config.ts b/config.ts
index 8dbe5516..bc2d22e0 100644
--- a/config.ts
+++ b/config.ts
@@ -5,4 +5,4 @@ export const defaultConfig: Config = {
   match: "https://www.builder.io/c/docs/**",
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
-};
\ No newline at end of file
+};
diff --git a/src/config.ts b/src/config.ts
index 04d5c327..2195a661 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -9,28 +9,30 @@ const Page: z.ZodType<Page> = z.any();
  * @example "https://www.builder.io/c/docs/**"
  * @default ""
  */
-export const OriginMatch = z.string().or(z.array(z.string()))
+export const OriginMatch = z.string().or(z.array(z.string()));
 
-export const PatternMatch = z.array(z.object({
-  /**
-   * Pattern to match against for links on a page to subsequently crawl
-   * @example "https://www.builder.io/c/docs/**"
-   * @refer https://github.com/isaacs/minimatch
-   * @default ""
-   */
-  pattern: z.string(),
-  /**
-   * Selector to grab the inner text from, limited to pattern
-   * @example ".docs-builder-container"
-   * @default "body"
-   */
-  selector: z.string().optional(),
-  /**
-   * Skip to grap inner text for this pattern
-   * @default false
-   */
-  skip: z.boolean().optional()
-}))
+export const PatternMatch = z.array(
+  z.object({
+    /**
+     * Pattern to match against for links on a page to subsequently crawl
+     * @example "https://www.builder.io/c/docs/**"
"https://www.builder.io/c/docs/**" + * @refer https://github.com/isaacs/minimatch + * @default "" + */ + pattern: z.string(), + /** + * Selector to grab the inner text from, limited to pattern + * @example ".docs-builder-container" + * @default "body" + */ + selector: z.string().optional(), + /** + * Skip to grap inner text for this pattern + * @default false + */ + skip: z.boolean().optional(), + }), +); export const configSchema = z.object({ /** diff --git a/src/core.ts b/src/core.ts index 7b13045a..150df333 100644 --- a/src/core.ts +++ b/src/core.ts @@ -2,8 +2,15 @@ import { PlaywrightCrawler, downloadListOfUrls } from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; -import { minimatch } from 'minimatch' -import { Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType } from "./config.js"; +import { minimatch } from "minimatch"; +import { + Config, + configSchema, + PatternMatch, + PatternMatchType, + OriginMatch, + OriginMatchType, +} from "./config.js"; import { Page } from "playwright"; import { isWithinTokenLimit } from "gpt-tokenizer"; @@ -72,22 +79,22 @@ export async function crawl(config: Config) { `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`, ); - let globs: string | string[] = [] + let globs: string | string[] = []; if (PatternMatch.safeParse(config.match).success) { - const matchPattern = config.match as PatternMatchType - globs = matchPattern.map(s => s.pattern) + const matchPattern = config.match as PatternMatchType; + globs = matchPattern.map((s) => s.pattern); const matchedPattern = matchPattern.find((match) => { return minimatch(request.url, match.pattern); - }) + }); if (matchedPattern && !matchedPattern.skip) { - const selector = matchedPattern?.selector || 'body'; + const selector = matchedPattern?.selector || "body"; // Use custom handling for XPath selector if (selector.startsWith("/")) { await waitForXPath( page, selector, - config.waitForSelectorTimeout ?? 1000 + config.waitForSelectorTimeout ?? 1000, ); } else { await page.waitForSelector(selector, { @@ -95,13 +102,16 @@ export async function crawl(config: Config) { }); } const html = await getPageHtml(page, selector); - + // Save results as JSON to ./storage/datasets/default await pushData({ title, url: request.loadedUrl, html }); } - } else if (OriginMatch.safeParse(config.match).success && config.selector) { - const match = config.match as OriginMatchType - globs = typeof match === "string" ? [match] : match + } else if ( + OriginMatch.safeParse(config.match).success && + config.selector + ) { + const match = config.match as OriginMatchType; + globs = typeof match === "string" ? [match] : match; // Use custom handling for XPath selector if (config.selector.startsWith("/")) { await waitForXPath( @@ -127,7 +137,7 @@ export async function crawl(config: Config) { // Extract links from the current page // and add them to the crawling queue. await enqueueLinks({ - globs + globs, }); }, // Comment this option to scrape the full website.