/**
  A `facet` directive in the Snooty AST.

  Facets are name/value pairs attached to a page, for example
  `{ name: "genre", values: "tutorial" }`.
 */
export type SnootyFacetNode = SnootyNode & {
  type: "directive";
  name: "facet";
  // NOTE(review): sample pages carry `children: []` on these directives;
  // confirm whether `never[]` models that more precisely than `never`.
  children: never;
  options?: {
    /**
      Name of the facet.
      @example "genre"
     */
    name: string;

    /**
      Value of the facet.
      @example "tutorial"
     */
    values: string;
  };
};

/**
  A `meta` directive in the Snooty AST.

  A page may contain several `meta` directives, each carrying only some of
  the options below (for example one with just `keywords` and another with
  just `description`), so every option is optional.
 */
export type SnootyMetaNode = SnootyNode & {
  type: "directive";
  name: "meta";
  // NOTE(review): sample pages carry `children: []` on these directives;
  // confirm whether `never[]` models that more precisely than `never`.
  children: never;
  options?: {
    /**
      List of relevant keywords for the page, comma separated.
      @example "code example, node.js, analyze, array"
     */
    keywords?: string;

    /**
      High-level description of the page.
     */
    description?: string;
    [key: string]: string | undefined;
  };
};
*/ @@ -328,6 +361,7 @@ export const handlePage = async ( body = snootyAstToMd(page.ast); title = getTitleFromSnootyAst(page.ast); } + const metadata = getMetadataFromSnootyAst(page.ast); return { url: new URL(pagePath, baseUrl.replace(/\/?$/, "/")).href.replace( @@ -339,6 +373,7 @@ export const handlePage = async ( body, format, metadata: { + ...metadata, tags, productName, version, diff --git a/packages/ingest-mongodb-public/src/sources/snooty/snootyAstToMd.test.ts b/packages/ingest-mongodb-public/src/sources/snooty/snootyAstToMd.test.ts index 5b70dd99d..a0c5c1f40 100644 --- a/packages/ingest-mongodb-public/src/sources/snooty/snootyAstToMd.test.ts +++ b/packages/ingest-mongodb-public/src/sources/snooty/snootyAstToMd.test.ts @@ -1,6 +1,10 @@ import Path from "path"; import fs from "fs"; -import { snootyAstToMd, getTitleFromSnootyAst } from "./snootyAstToMd"; +import { + snootyAstToMd, + getTitleFromSnootyAst, + getMetadataFromSnootyAst, +} from "./snootyAstToMd"; import { SnootyNode } from "./SnootyDataSource"; import { rstToSnootyAst } from "./rstToSnootyAst"; @@ -300,3 +304,33 @@ describe("getTitleFromSnootyAst", () => { ); }); }); + +describe("getMetadataFromSnootyAst", () => { + const sampleMetadataPage = JSON.parse( + fs.readFileSync( + Path.resolve(SRC_ROOT, "../testData/samplePageWithMetadata.json"), + { + encoding: "utf-8", + } + ) + ); + it("extracts meta directives", () => { + const metadata = getMetadataFromSnootyAst(sampleMetadataPage.data.ast); + expect(metadata).toMatchObject({ + pageDescription: expect.any(String), + }); + }); + it("extracts meta.keyword directives as string[]", () => { + const metadata = getMetadataFromSnootyAst(sampleMetadataPage.data.ast); + expect(metadata).toMatchObject({ + pageKeywords: expect.arrayContaining([expect.any(String)]), + }); + }); + it("extracts facet directives", () => { + const metadata = getMetadataFromSnootyAst(sampleMetadataPage.data.ast); + expect(metadata).toMatchObject({ + pageGenre: "tutorial", + pageFoo: 
/**
  Collects page-level metadata from the `facet` and `meta` directives found
  anywhere under the given Snooty AST node.

  Every resulting key is prefixed with `page` and camel-cased:

  - A facet `{ name: "genre", values: "tutorial" }` becomes
    `pageGenre: "tutorial"`.
  - `meta.keywords` (comma separated) becomes `pageKeywords: string[]`.
    Entries are trimmed and empty entries (e.g. from a trailing comma) are
    dropped; if nothing remains, the key is not set.
  - `meta.description` becomes `pageDescription: string`.

  Other `meta` options are ignored. If a key is produced more than once, the
  later directive wins, and `meta` values take precedence over facet values.

  @param node - Root of the Snooty AST (or subtree) to search.
  @returns Flat metadata record; empty if no usable directives are present.
 */
export const getMetadataFromSnootyAst = (
  node: SnootyNode
): Record<string, string | string[]> => {
  const keyPrefix = "page";

  const directives = findAll(
    node,
    ({ name }) => name === "facet" || name === "meta"
  ) as (SnootyFacetNode | SnootyMetaNode)[];

  const facetNodes = directives.filter(
    (n): n is SnootyFacetNode => n.name === "facet"
  );
  const metaNodes = directives.filter(
    (n): n is SnootyMetaNode => n.name === "meta"
  );

  const metadata: Record<string, string | string[]> = {};

  // Facets are written first so that `meta` values override them on a key
  // collision.
  for (const { options } of facetNodes) {
    if (!options) {
      continue;
    }
    const { name, values } = options;
    // The AST comes from parsed JSON, so check the runtime type rather than
    // trusting the declared one.
    if (
      typeof name === "string" &&
      typeof values === "string" &&
      name &&
      values
    ) {
      metadata[createKeyName(name, keyPrefix)] = values;
    }
  }

  for (const { options } of metaNodes) {
    if (!options) {
      continue;
    }
    const { keywords, description } = options;
    if (typeof keywords === "string") {
      const keywordList = keywords
        .split(",")
        .map((keyword) => keyword.trim())
        .filter((keyword) => keyword.length > 0);
      // Do not record (or overwrite with) an empty keyword list.
      if (keywordList.length > 0) {
        metadata[createKeyName("keywords", keyPrefix)] = keywordList;
      }
    }
    if (typeof description === "string" && description) {
      metadata[createKeyName("description", keyPrefix)] = description;
    }
  }

  return metadata;
};

/**
  Builds a camel-cased key by upper-casing the first character of `key` and
  prepending `prefix`.

  @example createKeyName("genre", "page") // "pageGenre"
 */
function createKeyName(key: string, prefix = ""): string {
  return prefix + key.charAt(0).toUpperCase() + key.slice(1);
}
"options": { "name": "genre", "values": "tutorial" } + }, + { + "type": "directive", + "position": { "start": { "line": 12 } }, + "children": [], + "domain": "mongodb", + "name": "facet", + "argument": [], + "options": { "name": "foo", "values": "bar" } + }, + { + "type": "directive", + "position": { "start": { "line": 16 } }, + "children": [], + "domain": "", + "name": "meta", + "argument": [], + "options": { "keywords": "code example, node.js, analyze, array" } + }, + { + "type": "directive", + "position": { "start": { "line": 16 } }, + "children": [], + "domain": "", + "name": "meta", + "argument": [], + "options": { + "description": "I love unpacking arrays soooo much!" + } + } + ] + } + ], + "fileid": "aggregation-tutorials/unpack-arrays.txt", + "options": { + "headings": [ + { + "depth": 2, + "id": "introduction", + "title": [ + { + "type": "text", + "position": { "start": { "line": 20 } }, + "value": "Introduction" + } + ], + "selector_ids": {} + }, + { + "depth": 3, + "id": "aggregation-task-summary", + "title": [ + { + "type": "text", + "position": { "start": { "line": 33 } }, + "value": "Aggregation Task Summary" + } + ], + "selector_ids": {} + }, + { + "depth": 2, + "id": "before-you-get-started", + "title": [ + { + "type": "text", + "position": { "start": { "line": 46 } }, + "value": "Before You Get Started" + } + ], + "selector_ids": {} + }, + { + "depth": 2, + "id": "tutorial", + "title": [ + { + "type": "text", + "position": { "start": { "line": 73 } }, + "value": "Tutorial" + } + ], + "selector_ids": {} + }, + { + "depth": 3, + "id": "add-an-unwind-stage-to-unpack-the-array-of-product-orders", + "title": [ + { + "type": "text", + "position": { "start": { "line": 78 } }, + "value": "Add an unwind stage to unpack the array of product orders" + } + ], + "selector_ids": {} + }, + { + "depth": 3, + "id": "add-a-match-stage-for-products-that-cost-more-than--15", + "title": [ + { + "type": "text", + "position": { "start": { "line": 91 } }, + "value": "Add 
a match stage for products that cost more than $15" + } + ], + "selector_ids": {} + }, + { + "depth": 3, + "id": "add-a-group-stage-to-group-by-product-type", + "title": [ + { + "type": "text", + "position": { "start": { "line": 104 } }, + "value": "Add a group stage to group by product type" + } + ], + "selector_ids": {} + }, + { + "depth": 3, + "id": "add-a-set-stage-to-display-the-product-id", + "title": [ + { + "type": "text", + "position": { "start": { "line": 123 } }, + "value": "Add a set stage to display the product ID" + } + ], + "selector_ids": {} + }, + { + "depth": 3, + "id": "add-an-unset-stage-to-remove-unneeded-fields", + "title": [ + { + "type": "text", + "position": { "start": { "line": 137 } }, + "value": "Add an unset stage to remove unneeded fields" + } + ], + "selector_ids": {} + }, + { + "depth": 3, + "id": "run-the-aggregation-pipeline", + "title": [ + { + "type": "text", + "position": { "start": { "line": 151 } }, + "value": "Run the aggregation pipeline" + } + ], + "selector_ids": {} + }, + { + "depth": 3, + "id": "interpret-results", + "title": [ + { + "type": "text", + "position": { "start": { "line": 170 } }, + "value": "Interpret results" + } + ], + "selector_ids": {} + } + ] + } + }, + "build_id": "6717c80e760ab774664f96f0", + "created_at": "2023-12-13T15:26:05.172Z", + "deleted": false, + "filename": "aggregation-tutorials/unpack-arrays.txt", + "static_assets": [], + "updated_at": "2024-10-22T15:45:27.716Z" + } +}