From 9d184b9c7b1470abf7cea3e458335f219276c900 Mon Sep 17 00:00:00 2001
From: FTAndy
Date: Fri, 8 Dec 2023 16:39:10 +0800
Subject: [PATCH] refactor: reformat core files for improved readability

- `README.md`:
  - Reformat the `match` property's union type across multiple lines;
    the accepted shapes (a string, an array of strings, or an array of
    pattern objects) are unchanged.
  - Reword the `skip` comment for grammar.
- `config.ts`:
  - Add the missing newline at end of file.
- `src/config.ts`:
  - Terminate the `OriginMatch` and `PatternMatch` declarations with
    semicolons, and reformat `PatternMatch` so the `z.object({...})`
    literal is indented inside `z.array(...)` with trailing commas.
  - Fix the "grap" typo in the `skip` doc comment.
- `src/core.ts`:
  - Normalize import quotes and split the long `./config.js` import
    across multiple lines.
  - In the `crawl` function, add the missing semicolons (the `globs`
    declaration and the `matchPattern`, `globs`, and `match`
    assignments), add trailing commas to multi-line call arguments, and
    wrap the long `else if` condition across multiple lines.

Comment and formatting changes only; runtime behavior is unchanged.

Signed-off-by: FTAndy
---
 README.md     | 13 ++++++++-----
 config.ts     |  2 +-
 src/config.ts | 44 +++++++++++++++++++++++---------------------
 src/core.ts   | 36 +++++++++++++++++++++++------------
 4 files changed, 55 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index 55db7170..165bce8e 100644
--- a/README.md
+++ b/README.md
@@ -71,11 +71,14 @@ type Config = {
   /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
-  match: string | string[] | {
-    pattern: string; // url glob expressions from https://github.com/isaacs/minimatch
-    selector?: string | undefined; // Selector to grab the inner text from
-    skip?: boolean | undefined; // Whether skip to not grab any content from this pattern
-  }[];
+  match:
+    | string
+    | string[]
+    | {
+        pattern: string; // URL glob expressions from https://github.com/isaacs/minimatch
+        selector?: string | undefined; // Selector to grab the inner text from
+        skip?: boolean | undefined; // Whether to skip grabbing any content from this pattern
+      }[];
   /** Selector to grab the inner text from */
   selector: string;
   /** Don't crawl more than this many pages */
diff --git a/config.ts b/config.ts
index 8dbe5516..bc2d22e0 100644
--- a/config.ts
+++ b/config.ts
@@ -5,4 +5,4 @@ export const defaultConfig: Config = {
   match: "https://www.builder.io/c/docs/**",
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
-};
\ No newline at end of file
+};
diff --git a/src/config.ts b/src/config.ts
index 04d5c327..2195a661 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -9,28 +9,30 @@ const Page: z.ZodType<Page> = z.any();
  * @example "https://www.builder.io/c/docs/**"
  * @default ""
  */
-export const OriginMatch = z.string().or(z.array(z.string()))
+export const OriginMatch = z.string().or(z.array(z.string()));
 
-export const PatternMatch = z.array(z.object({
-  /**
-   * Pattern to match against for links on a page to subsequently crawl
-   * @example "https://www.builder.io/c/docs/**"
-   * @refer https://github.com/isaacs/minimatch
-   * @default ""
-   */
-  pattern: z.string(),
-  /**
-   * Selector to grab the inner text from, limited to pattern
-   * @example ".docs-builder-container"
-   * @default "body"
-   */
-  selector: z.string().optional(),
-  /**
-   * Skip to grap inner text for this pattern
-   * @default false
-   */
-  skip: z.boolean().optional()
-}))
+export const PatternMatch = z.array(
+  z.object({
+    /**
+     * Pattern to match against for links on a page to subsequently crawl
+     * @example "https://www.builder.io/c/docs/**"
"https://www.builder.io/c/docs/**" + * @refer https://github.com/isaacs/minimatch + * @default "" + */ + pattern: z.string(), + /** + * Selector to grab the inner text from, limited to pattern + * @example ".docs-builder-container" + * @default "body" + */ + selector: z.string().optional(), + /** + * Skip to grap inner text for this pattern + * @default false + */ + skip: z.boolean().optional(), + }), +); export const configSchema = z.object({ /** diff --git a/src/core.ts b/src/core.ts index 7b13045a..150df333 100644 --- a/src/core.ts +++ b/src/core.ts @@ -2,8 +2,15 @@ import { PlaywrightCrawler, downloadListOfUrls } from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; -import { minimatch } from 'minimatch' -import { Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType } from "./config.js"; +import { minimatch } from "minimatch"; +import { + Config, + configSchema, + PatternMatch, + PatternMatchType, + OriginMatch, + OriginMatchType, +} from "./config.js"; import { Page } from "playwright"; import { isWithinTokenLimit } from "gpt-tokenizer"; @@ -72,22 +79,22 @@ export async function crawl(config: Config) { `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`, ); - let globs: string | string[] = [] + let globs: string | string[] = []; if (PatternMatch.safeParse(config.match).success) { - const matchPattern = config.match as PatternMatchType - globs = matchPattern.map(s => s.pattern) + const matchPattern = config.match as PatternMatchType; + globs = matchPattern.map((s) => s.pattern); const matchedPattern = matchPattern.find((match) => { return minimatch(request.url, match.pattern); - }) + }); if (matchedPattern && !matchedPattern.skip) { - const selector = matchedPattern?.selector || 'body'; + const selector = matchedPattern?.selector || "body"; // Use custom handling for XPath selector if (selector.startsWith("/")) { await waitForXPath( page, selector, - config.waitForSelectorTimeout ?? 1000 + config.waitForSelectorTimeout ?? 1000, ); } else { await page.waitForSelector(selector, { @@ -95,13 +102,16 @@ export async function crawl(config: Config) { }); } const html = await getPageHtml(page, selector); - + // Save results as JSON to ./storage/datasets/default await pushData({ title, url: request.loadedUrl, html }); } - } else if (OriginMatch.safeParse(config.match).success && config.selector) { - const match = config.match as OriginMatchType - globs = typeof match === "string" ? [match] : match + } else if ( + OriginMatch.safeParse(config.match).success && + config.selector + ) { + const match = config.match as OriginMatchType; + globs = typeof match === "string" ? [match] : match; // Use custom handling for XPath selector if (config.selector.startsWith("/")) { await waitForXPath( @@ -127,7 +137,7 @@ export async function crawl(config: Config) { // Extract links from the current page // and add them to the crawling queue. await enqueueLinks({ - globs + globs, }); }, // Comment this option to scrape the full website.