
Commit ef110d4

Joaquín R. Montes and guimachiavelli authored
[MEI-23]: Review broken links checker (#2533)
* feat: split checker for local usage
* chore: add example broken link to trigger action
* fix: use type:module again
* nits
* test: es2015 target
* test: with --input-type=module
* test: compile checker every time
* fixes and main script
* chore: improve scripts naming
* fix: use slugger locally
* Update .vscode/settings.json
* fix final broken links (#2534)
* fix: empty type in local table

---------

Co-authored-by: gui machiavelli <[email protected]>
1 parent bd79848 commit ef110d4

File tree

15 files changed (+482 lines, −338 lines)


.github/actions/validate-docs-links/README.MD

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 # GitHub Action for Validating Documentation Links
 
-This action ensures that internal links in `.mdx` files in the `/docs/` directory are valid. It runs on every pull request that includes changes to these files.
+This action ensures that internal links in `.mdx` files are valid. It runs on every pull request that includes changes to these files and targets the `main` branch.
 
 The action is triggered by the workflow defined in `.github/workflows/validate-docs-links.yml`.
 
```

.github/actions/validate-docs-links/lib/index.js

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default.

.github/actions/validate-docs-links/package.json

Lines changed: 8 additions & 3 deletions
```diff
@@ -1,22 +1,27 @@
 {
   "private": true,
-  "type": "module",
   "exports": "./lib/index.js",
   "files": [
     "src"
   ],
+  "type": "module",
   "scripts": {
-    "build": "npm run types && ncc -m -o ./lib build src/index.ts --license licenses.txt",
+    "local:build": "yarn run types && ncc -m -o ./dist build src/checker.ts",
+    "local:run-checker": "node dist --run-local-checker",
+    "local:check": "yarn local:build && yarn local:run-checker",
+    "action:build": "yarn run types && ncc -m -o ./lib build src/index.ts --license licenses.txt",
+    "build": "yarn action:build",
     "types": "tsc"
   },
   "devDependencies": {
     "@types/github-slugger": "^1.3.0",
+    "@types/node": "^20.4.9",
+    "ts-node": "^10.9.1",
     "@vercel/ncc": "0.34.0"
   },
   "dependencies": {
     "@actions/core": "^1.10.0",
     "@actions/github": "^5.1.1",
-    "@types/node": "^20.4.1",
     "github-slugger": "1.2.0",
     "gray-matter": "4.0.2",
     "rehype-raw": "4.0.1",
```
.github/actions/validate-docs-links/src/checker.ts

Lines changed: 324 additions & 0 deletions
```ts
const fs = require('fs/promises')
const path = require('path')
const unified = require('unified')
const markdown = require('remark-parse')
const remarkToRehype = require('remark-rehype')
const raw = require('rehype-raw')
const visit = require('unist-util-visit')
const matter = require('gray-matter')
const GithubSlugger = require('github-slugger')
import type { Node, Data } from 'unist'

/**
 * This script validates internal links in /docs including internal,
 * hash, source and related links. It does not validate external links.
 * 1. Collects all .mdx files.
 * 2. For each file, it extracts the content, metadata, and heading slugs.
 * 3. It creates a document map to efficiently lookup documents by path.
 * 4. It then traverses each document modified in the PR and...
 *    - Checks if each internal link points to an existing document
 *    - Validates hash links (links starting with "#") against the list of
 *      headings in the current document.
 *    - Checks the source and related links found in the metadata of each
 *      document.
 * 5. Any broken links discovered during these checks are categorized and a
 *    comment is added to the PR.
 */

interface Document {
  body: string
  path: string
  slug: string
  headings: string[]
  sidebarDepth?: number
}

interface Errors {
  doc: Document
  link: string[]
  hash: string[]
  source: string[]
  related: string[]
}

type ErrorType = Exclude<keyof Errors, 'doc'>

/** Label: Is the text that will be displayed in the sidebar */
type RouteSchema = { source: string, label: string, slug: string }

/**
 * Source: Is the path to the .mdx file
 *
 * Slug: Is the route we will use to access the page in the browser
 */
type RouteFragment = Omit<RouteSchema, 'label'>

type FooterConfigSchema = Array<RouteSchema |
  {
    "to": string,
    "label": string,
    "slug": string
  }>

type ConfigSchema = Array<{ title: string, slug: string, routes: Array<RouteSchema> }>
type FailureFunction = (message: string) => void

const RELATIVE_PATH = '/'
const EXCLUDED_HASHES: string[] = []

const slugger = new GithubSlugger()

// Collect the paths of all .mdx files present in the config files
async function getAllMdxFilePaths(basePath: string): Promise<RouteFragment[]> {
  const sidebarLearn: ConfigSchema = JSON.parse(await fs.readFile(path.join(basePath, 'config/sidebar-learn.json'), 'utf8'))
  const sidebarReference: ConfigSchema = JSON.parse(await fs.readFile(path.join(basePath, 'config/sidebar-reference.json'), 'utf8'))
  const footer: FooterConfigSchema = JSON.parse(await fs.readFile(path.join(basePath, 'config/sidebar-footer.json'), 'utf8'))

  const config = [
    ...sidebarLearn.map(group => ({ ...group, slug: path.join('learn', group.slug) })),
    ...sidebarReference.map(group => ({ ...group, slug: path.join('reference/', group.slug) }))
  ]

  let allRoutes: RouteSchema[] = [{ source: path.join(basePath, 'home.mdx'), slug: '', label: 'Homepage' }]
  for (const group of config) {
    allRoutes = allRoutes.concat(group.routes.map(route => ({
      ...route,
      slug: path.join(group.slug, route.slug),
      source: path.join(basePath, route.source)
    })))
  }
  footer.forEach(item => 'source' in item && allRoutes.push({ ...item, source: path.join(basePath, item.source) }))

  return allRoutes
}

// Returns the slugs of all headings in a tree
function getHeadingsFromMarkdownTree(tree: Node<Data>): string[] {
  const headings: string[] = []
  slugger.reset()

  visit(tree, 'heading', (node: Node<Data>) => {
    let headingText = ''
    // Account for headings with inline code blocks by concatenating the
    // text values of all children of a heading node.
    visit(node, (node: any) => {
      if (node.value) {
        headingText += node.value
      }
    })
    headings.push(slugger.slug(headingText))
  })

  return headings
}

// Create a processor to parse MDX content
const markdownProcessor = unified()
  .use(markdown)
  .use(remarkToRehype, { allowDangerousHTML: true })
  .use(raw)
  .use(function compiler() {
    // A compiler is required, and we only need the AST, so we can
    // just return it.
    // @ts-ignore
    this.Compiler = function treeCompiler(tree) {
      return tree
    }
  })

// use Map for faster lookup
let documentMap: Map<string, Document>

// Create a map of documents with their paths as keys and
// document content and metadata as values
// The key varies between doc pages and error pages
// error pages: `/docs/messages/example`
// doc pages: `api/example`
async function prepareDocumentMapEntry(
  route: RouteFragment,
  setFailed: FailureFunction
): Promise<[string, Document]> {
  try {
    const mdxContent = await fs.readFile(route.source, 'utf8')
    const { content, data } = matter(mdxContent)
    const tree = markdownProcessor.parse(content)
    const headings = getHeadingsFromMarkdownTree(tree)

    return [
      route.slug,
      { body: content, path: route.source, slug: route.slug, headings, ...data },
    ]
  } catch (error) {
    setFailed(`Error preparing document map for file ${route}: ${error}`)
    return ['', {} as Document]
  }
}

// Checks if the links point to existing documents
function validateInternalLink(errors: Errors, href: string): void {
  // /docs/api/example#heading -> ["api/example", "heading"]
  const [link, hash] = href.split('#')

  // check if doc page exists
  const foundPage = documentMap.get(link.replace(/^\/+/, ''))

  if (!foundPage) {
    errors.link.push(href)
  } else if (hash && !EXCLUDED_HASHES.includes(hash)) {
    // TODO: Check if this block is still needed
    // // Account for documents that pull their content from another document
    // const foundPageSource = foundPage.source
    //   ? documentMap.get(foundPage.source)
    //   : undefined

    // Check if the hash link points to an existing section within the document
    // const hashFound = (foundPageSource || foundPage).headings.includes(hash)
    const hashFound = foundPage.headings.includes(hash)

    if (!hashFound) {
      errors.hash.push(href)
    }
  }
}

// Checks if the hash links point to existing sections within the same document
function validateHashLink(errors: Errors, href: string, doc: Document): void {
  const hashLink = href.replace('#', '')

  if (!EXCLUDED_HASHES.includes(hashLink) && !doc.headings.includes(hashLink)) {
    errors.hash.push(href)
  }
}

// Checks if the source link points to an existing document
function validateSourceLinks(doc: Document, errors: Errors): void {
  if (doc.slug && !documentMap.get(doc.slug)) {
    errors.source.push(doc.path)
  }
}

// Traverse the document tree and validate links
function traverseTreeAndValidateLinks(tree: any, doc: Document, setFailed: FailureFunction): Errors {
  const errors: Errors = {
    doc,
    link: [],
    hash: [],
    source: [],
    related: [],
  }

  try {
    visit(tree, (node: any) => {
      if (node.type === 'element' && node.tagName === 'a') {
        const href = node.properties.href

        if (!href) return

        if (href.startsWith(RELATIVE_PATH)) {
          validateInternalLink(errors, href)
        } else if (href.startsWith('#')) {
          validateHashLink(errors, href, doc)
        }
      }
    })

    validateSourceLinks(doc, errors)
  } catch (error) {
    setFailed('Error traversing tree: ' + error)
  }

  return errors
}

const formatTableRow = (
  link: string,
  errorType: ErrorType,
  docPath: string,
  sha?: string
) => {
  if (process.argv[2] === '--run-local-checker') return `| ${link} | ${errorType} | /${docPath} | \n`
  return `| ${link} | ${errorType} | [/${docPath}](https://github.com/meilisearch/documentation/blob/${sha}/${docPath}) | \n`
}

// Main function that triggers link validation across .mdx files
export async function validateAllInternalLinks(basePath: string, setFailed: FailureFunction, sha?: string, useComment?: (comment: string, errorsExist: boolean) => Promise<void>): Promise<void> {
  try {
    const allMdxFilePaths = await getAllMdxFilePaths(basePath)

    documentMap = new Map(
      await Promise.all(allMdxFilePaths.map(route => prepareDocumentMapEntry(route, setFailed)))
    )

    const docProcessingPromises = allMdxFilePaths.map(async (route) => {
      const doc = documentMap.get(route.slug)
      if (doc) {
        const tree = (await markdownProcessor.process(doc.body)).contents
        return traverseTreeAndValidateLinks(tree, doc, setFailed)
      } else {
        return {
          doc: {} as Document,
          link: [],
          hash: [],
          source: [],
          related: [],
        } as Errors
      }
    })

    const allErrors = await Promise.all(docProcessingPromises)

    let errorsExist = false

    let errorRows: string[] = []

    const errorTypes: ErrorType[] = ['link', 'hash', 'source', 'related']
    allErrors.forEach((errors) => {
      const {
        doc: { path: docPath },
      } = errors

      errorTypes.forEach((errorType) => {
        if (errors[errorType].length > 0) {
          errorsExist = true
          errors[errorType].forEach((link) => {
            errorRows.push(formatTableRow(link, errorType, docPath, sha))
          })
        }
      })
    })

    const errorComment = [
      'Hi there :wave:\n\nIt looks like this PR introduces broken links to the docs, please take a moment to fix them before merging:\n\n| Broken link | Type | File | \n| ----------- | ----------- | ----------- | \n',
      ...errorRows,
      '\nThank you :pray:',
    ].join('')

    if (errorsExist) {
      await useComment?.(errorComment, errorsExist)
      const errorTableData = allErrors.flatMap((errors) => {
        const { doc } = errors

        return errorTypes.flatMap((errorType) =>
          errors[errorType].map((link) => ({
            docPath: doc.path,
            errorType,
            link,
          }))
        )
      })

      console.log('This PR introduces broken links to the docs:')
      console.table(errorTableData, ['link', 'errorType', 'docPath'])
    } else {
      await useComment?.('All broken links are now fixed, thank you!', errorsExist)
      console.log("This PR doesn't introduce any broken links to the docs. :D")
    }
  } catch (error) {
    setFailed('Error validating internal links: ' + error)
  }
}

if (process.argv[2] === '--run-local-checker') {
  validateAllInternalLinks('../../../', (message) => { throw new Error(message) })
}
```
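For context, the CI side of this split (`src/index.ts`, whose compiled `lib/index.js` diff is not rendered above) presumably calls the exported `validateAllInternalLinks` with the Actions runtime's failure handler and a PR-comment hook. A minimal sketch of such a caller, assuming only the `@actions/core` and `@actions/github` packages already listed in the dependencies; the `postComment` helper is hypothetical, not the action's actual comment logic:

```ts
// Hypothetical caller for validateAllInternalLinks. A sketch only, not the
// actual src/index.ts from this commit.
import * as core from '@actions/core'
import * as github from '@actions/github'
import { validateAllInternalLinks } from './checker'

async function main(): Promise<void> {
  // Commit SHA used by formatTableRow to build permalinks in the PR comment.
  const sha = github.context.sha

  // Hypothetical comment hook: the real action presumably creates or updates
  // a PR comment through the GitHub API rather than logging to stdout.
  const postComment = async (comment: string, errorsExist: boolean): Promise<void> => {
    console.log(errorsExist ? comment : 'No broken links found.')
  }

  // basePath points at the repository root, three directories above
  // .github/actions/validate-docs-links, matching the local checker's default.
  await validateAllInternalLinks('../../../', core.setFailed, sha, postComment)
}

main()
```

Passing `core.setFailed` as the `FailureFunction` means any error marks the workflow run as failed, whereas the local entry point at the bottom of `checker.ts` simply throws.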
