refactor: refactor core functionality for improved readability
- Modify the `README.md` file:
  - Change the `match` property type to accept an array of strings.
- Modify the `src/config.ts` file:
  - Change the `OriginMatch` property type to accept an array of strings.
  - Change the `PatternMatch` property type to accept an array of objects.
- Modify the `src/core.ts` file:
  - Add import statements for `minimatch`, `Config`, `PatternMatch`, and `OriginMatch`.
  - Modify the `crawl` function:
    - Change the `globs` variable declaration to include a semicolon at the end.
    - Change the condition for checking `matchedPattern` to use the optional chaining operator.
    - Add a missing trailing comma to the `waitForXPath` call's timeout argument.
    - Split the multi-clause `else if` condition across separate lines for better readability.

Signed-off-by: FTAndy <[email protected]>
FTAndy committed Dec 8, 2023
1 parent f620176 commit 9d184b9
Showing 4 changed files with 55 additions and 40 deletions.
13 changes: 8 additions & 5 deletions README.md
```diff
@@ -71,11 +71,14 @@ type Config = {
   /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
-  match: string | string[] | {
-    pattern: string; // url glob expressions from https://github.com/isaacs/minimatch
-    selector?: string | undefined; // Selector to grab the inner text from
-    skip?: boolean | undefined; // Whether to skip grabbing any content from this pattern
-  }[];
+  match:
+    | string
+    | string[]
+    | {
+        pattern: string; // url glob expressions from https://github.com/isaacs/minimatch
+        selector?: string | undefined; // Selector to grab the inner text from
+        skip?: boolean | undefined; // Whether to skip grabbing any content from this pattern
+      }[];
   /** Selector to grab the inner text from */
   selector: string;
   /** Don't crawl more than this many pages */
```
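
For context, here is a hypothetical crawl config exercising each shape of the new `match` union. The URLs and selectors are placeholders rather than values from this commit, and whether the top-level `selector` stays required when pattern objects carry their own is not shown in this hunk:

```ts
// Placeholder values throughout; shapes follow the README type above.

// String-array form: URL globs plus one global selector.
const stringMatchConfig = {
  url: "https://www.builder.io/c/docs/developers",
  match: ["https://www.builder.io/c/docs/**"],
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};

// Pattern-object form: per-pattern selectors, with `skip` to follow
// links on matching pages without extracting their content.
const patternMatchConfig = {
  url: "https://www.builder.io/c/docs/developers",
  match: [
    {
      pattern: "https://www.builder.io/c/docs/**",
      selector: ".docs-builder-container",
    },
    { pattern: "https://www.builder.io/blog/**", skip: true },
  ],
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};
```
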
2 changes: 1 addition & 1 deletion config.ts
```diff
@@ -5,4 +5,4 @@ export const defaultConfig: Config = {
   match: "https://www.builder.io/c/docs/**",
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
-};
\ No newline at end of file
+};
```
44 changes: 23 additions & 21 deletions src/config.ts
```diff
@@ -9,28 +9,30 @@ const Page: z.ZodType<Page> = z.any();
  * @example "https://www.builder.io/c/docs/**"
  * @default ""
  */
-export const OriginMatch = z.string().or(z.array(z.string()))
+export const OriginMatch = z.string().or(z.array(z.string()));
 
-export const PatternMatch = z.array(z.object({
-  /**
-   * Pattern to match against for links on a page to subsequently crawl
-   * @example "https://www.builder.io/c/docs/**"
-   * @refer https://github.com/isaacs/minimatch
-   * @default ""
-   */
-  pattern: z.string(),
-  /**
-   * Selector to grab the inner text from, limited to this pattern
-   * @example ".docs-builder-container"
-   * @default "body"
-   */
-  selector: z.string().optional(),
-  /**
-   * Skip grabbing the inner text for this pattern
-   * @default false
-   */
-  skip: z.boolean().optional()
-}))
+export const PatternMatch = z.array(
+  z.object({
+    /**
+     * Pattern to match against for links on a page to subsequently crawl
+     * @example "https://www.builder.io/c/docs/**"
+     * @refer https://github.com/isaacs/minimatch
+     * @default ""
+     */
+    pattern: z.string(),
+    /**
+     * Selector to grab the inner text from, limited to this pattern
+     * @example ".docs-builder-container"
+     * @default "body"
+     */
+    selector: z.string().optional(),
+    /**
+     * Skip grabbing the inner text for this pattern
+     * @default false
+     */
+    skip: z.boolean().optional(),
+  }),
+);
 
 export const configSchema = z.object({
   /**
```
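
A minimal sketch of how these two schemas can discriminate a `match` value at runtime, checked in the same order `src/core.ts` uses below. The `describeMatch` helper and its sample inputs are hypothetical, and the import path assumes the snippet sits next to `config.ts`:

```ts
import { OriginMatch, PatternMatch } from "./config.js";

// Hypothetical helper: classify a user-supplied `match` value.
// PatternMatch is tried first, mirroring the order in src/core.ts.
function describeMatch(match: unknown): string {
  if (PatternMatch.safeParse(match).success) {
    return "pattern match: array of { pattern, selector?, skip? } objects";
  }
  if (OriginMatch.safeParse(match).success) {
    return "origin match: a glob string or an array of glob strings";
  }
  return "invalid match value";
}

console.log(describeMatch("https://www.builder.io/c/docs/**"));
// -> origin match: a glob string or an array of glob strings
console.log(describeMatch([{ pattern: "https://example.com/**", skip: true }]));
// -> pattern match: array of { pattern, selector?, skip? } objects
```
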
36 changes: 23 additions & 13 deletions src/core.ts
```diff
@@ -2,8 +2,15 @@
 import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import { minimatch } from 'minimatch'
-import { Config, configSchema, PatternMatch, PatternMatchType, OriginMatch, OriginMatchType } from "./config.js";
+import { minimatch } from "minimatch";
+import {
+  Config,
+  configSchema,
+  PatternMatch,
+  PatternMatchType,
+  OriginMatch,
+  OriginMatchType,
+} from "./config.js";
 import { Page } from "playwright";
 import { isWithinTokenLimit } from "gpt-tokenizer";
 
@@ -72,36 +72,39 @@ export async function crawl(config: Config) {
         `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
       );
 
-      let globs: string | string[] = []
+      let globs: string | string[] = [];
 
       if (PatternMatch.safeParse(config.match).success) {
-        const matchPattern = config.match as PatternMatchType
-        globs = matchPattern.map(s => s.pattern)
+        const matchPattern = config.match as PatternMatchType;
+        globs = matchPattern.map((s) => s.pattern);
         const matchedPattern = matchPattern.find((match) => {
           return minimatch(request.url, match.pattern);
-        })
+        });
         if (matchedPattern && !matchedPattern.skip) {
-          const selector = matchedPattern?.selector || 'body';
+          const selector = matchedPattern?.selector || "body";
           // Use custom handling for XPath selector
           if (selector.startsWith("/")) {
             await waitForXPath(
               page,
               selector,
-              config.waitForSelectorTimeout ?? 1000
+              config.waitForSelectorTimeout ?? 1000,
             );
           } else {
             await page.waitForSelector(selector, {
               timeout: config.waitForSelectorTimeout ?? 1000,
             });
           }
           const html = await getPageHtml(page, selector);
 
           // Save results as JSON to ./storage/datasets/default
           await pushData({ title, url: request.loadedUrl, html });
         }
-      } else if (OriginMatch.safeParse(config.match).success && config.selector) {
-        const match = config.match as OriginMatchType
-        globs = typeof match === "string" ? [match] : match
+      } else if (
+        OriginMatch.safeParse(config.match).success &&
+        config.selector
+      ) {
+        const match = config.match as OriginMatchType;
+        globs = typeof match === "string" ? [match] : match;
         // Use custom handling for XPath selector
         if (config.selector.startsWith("/")) {
           await waitForXPath(
@@ -127,7 +137,7 @@
       // Extract links from the current page
      // and add them to the crawling queue.
       await enqueueLinks({
-        globs
+        globs,
       });
     },
     // Comment this option to scrape the full website.
```
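
To make the new routing concrete, here is a standalone sketch of the pattern branch with hypothetical patterns and URL. `minimatch(url, pattern)` reports whether the URL satisfies a glob, and `globs` is what the handler ultimately passes to `enqueueLinks`:

```ts
import { minimatch } from "minimatch";

// Hypothetical patterns and URL for illustration.
const matchPattern: { pattern: string; selector?: string; skip?: boolean }[] = [
  {
    pattern: "https://www.builder.io/c/docs/**",
    selector: ".docs-builder-container",
  },
  { pattern: "https://www.builder.io/blog/**", skip: true },
];
const url = "https://www.builder.io/c/docs/getting-started";

// Every pattern contributes a glob for link discovery, matched or not.
const globs = matchPattern.map((s) => s.pattern);

// The first pattern whose glob matches the current URL decides extraction.
const matched = matchPattern.find((m) => minimatch(url, m.pattern));

if (matched && !matched.skip) {
  // Fall back to "body" when the matched pattern has no selector,
  // as the handler above does.
  console.log(`extract with selector: ${matched.selector || "body"}`);
} else {
  console.log("no extraction for this page; links are still enqueued");
}
console.log("globs for enqueueLinks:", globs);
```
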
