Commit 2fe74dc

check broken links using localhost (github#27010)
* Revert "fix(link-check): add WAF token to avoid 403 (github#26964)"; this reverts commit 8bb5251.
* check broken links using localhost
* remove temp comment
1 parent e534711 commit 2fe74dc

2 files changed: +147 -71 lines changed
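In short: the previous check sent each candidate link to production docs.github.com with a WAF token header; this commit builds and starts the docs server inside the workflow and checks each extracted link path against http://localhost:4000 instead. Below is a minimal sketch of that per-link check, reusing the retry and timeout settings the new script introduces; the checkOne helper and the sample path are illustrative assumptions, not part of the commit.

// Sketch only: check a single docs path against the locally running server.
// `checkOne` and the example path are illustrative, not from the commit.
import got, { RequestError } from 'got'

const BASE_URL = process.env.BASE_URL || 'http://localhost:4000'
const retryConfiguration = { limit: 3 }
const timeoutConfiguration = 1000

async function checkOne(linkPath) {
  const url = new URL(BASE_URL + linkPath)
  try {
    await got(url.href, { retry: retryConfiguration, timeout: timeoutConfiguration })
    return true // the local server answered successfully
  } catch (error) {
    if (error instanceof RequestError) return false // counts as a broken link
    throw error // anything else (e.g. a programming error) should surface
  }
}

console.log(await checkOne('/en/get-started')) // example path, made up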

.github/workflows/check-broken-links-github-github.yml

Lines changed: 13 additions & 2 deletions
@@ -50,9 +50,20 @@ jobs:
       - name: Install Node.js dependencies
         run: npm ci
 
-      - name: Run broken github/github link check
+      - name: Build server
+        run: npm run build
+
+      - name: Start server in the background
         env:
-          WAF_TOKEN: ${{ secrets.WAF_TOKEN }}
+          NODE_ENV: production
+          PORT: 4000
+        run: |
+
+          node server.mjs &
+          sleep 5
+          curl --retry-connrefused --retry 3 -I http://localhost:4000/
+
+      - name: Run broken github/github link check
         run: |
           script/check-github-github-links.js > broken_github_github_links.md
 
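The new "Start server in the background" step launches node server.mjs, sleeps briefly, and uses curl --retry-connrefused --retry 3 to confirm the server answers on port 4000 before the link check runs. A rough Node equivalent of that readiness wait, for illustration only (waitForServer, the attempt count, and the delay are assumptions, not part of the commit):

// Illustrative alternative to the workflow's curl readiness check: poll the
// locally started server until it answers, or give up after a few attempts.
import got from 'got'

const BASE_URL = process.env.BASE_URL || 'http://localhost:4000'

async function waitForServer(attempts = 3, delayMs = 2000) {
  for (let attempt = 1; attempt <= attempts; attempt++) {
    try {
      await got.head(BASE_URL + '/') // same idea as `curl -I` against the root page
      return
    } catch {
      if (attempt === attempts) throw new Error(`Server did not come up at ${BASE_URL}`)
      await new Promise((resolve) => setTimeout(resolve, delayMs))
    }
  }
}

await waitForServer()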
script/check-github-github-links.js

Lines changed: 134 additions & 69 deletions
@@ -6,22 +6,60 @@
 //
 // [end-readme]
 
+import fs from 'fs/promises'
+
+import got, { RequestError } from 'got'
+
 import { getContents, getPathsWithMatchingStrings } from './helpers/git-utils.js'
-import got from 'got'
 
 if (!process.env.GITHUB_TOKEN) {
-  console.error('Error! You must have a GITHUB_TOKEN set in an .env file to run this script.')
-  process.exit(1)
+  throw new Error('Error! You must have a GITHUB_TOKEN set in an .env file to run this script.')
 }
 
-const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
+const FORCE_DOWNLOAD = Boolean(JSON.parse(process.env.FORCE_DOWNLOAD || 'false'))
+const BATCH_SIZE = JSON.parse(process.env.BATCH_SIZE || '10')
+const BASE_URL = process.env.BASE_URL || 'http://localhost:4000'
 
 main()
 
+// The way `got` does retries:
+//
+// sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
+//
+// So, it means:
+//
+// 1. ~1000ms
+// 2. ~2000ms
+// 3. ~4000ms
+//
+// ...if the limit we set is 3.
+// Our own timeout, in ./middleware/timeout.js defaults to 10 seconds.
+// So there's no point in trying more attempts than 3 because it would
+// just timeout on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
+const retryConfiguration = {
+  limit: 3,
+}
+// According to our Datadog metrics, the *average* time for the
+// the 'archive_enterprise_proxy' metric is ~70ms (excluding spikes)
+// which much less than 500ms.
+const timeoutConfiguration = 1000
+
 async function main() {
   const searchStrings = ['https://docs.github.com', 'GitHub help_url', 'GitHub developer_help_url']
-  const foundFiles = await getPathsWithMatchingStrings(searchStrings, 'github', 'github')
-  const searchFiles = [...foundFiles]
+
+  const foundFiles = []
+  try {
+    foundFiles.push(...JSON.parse(await fs.readFile('/tmp/foundFiles.json', 'utf-8')))
+  } catch (error) {
+    if (!(error.code && error.code === 'ENOENT')) {
+      throw error
+    }
+  }
+  if (!foundFiles.length || FORCE_DOWNLOAD) {
+    foundFiles.push(...(await getPathsWithMatchingStrings(searchStrings, 'github', 'github')))
+    await fs.writeFile('/tmp/foundFiles.json', JSON.stringify(foundFiles, undefined, 2), 'utf-8')
+  }
+  const searchFiles = [...new Set(foundFiles)] // filters out dupes
     .filter((file) => endsWithAny(['.rb', '.yml', '.yaml', '.txt', '.pdf', '.erb', '.js'], file))
     .filter(
       (file) =>
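The comment block added in this hunk spells out got's default backoff, sleep = 1000 * 2^(retry - 1) + Math.random() * 100, hence roughly 1s, 2s, and 4s for three retries, which is why a retry limit of 3 is paired with the ~10 second middleware timeout. A tiny worked example of that arithmetic, for illustration:

// Worked example of the backoff formula quoted in the comment above.
for (const retry of [1, 2, 3]) {
  const sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
  console.log(`retry ${retry}: wait ~${Math.round(sleep)} ms`) // ~1000, ~2000, ~4000
}
// Worst case the three retries add about 1000 + 2000 + 4000 = 7000 ms of waiting;
// a fourth retry (~8000 ms more) would overshoot the 10 s middleware timeout.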
@@ -35,79 +73,106 @@ async function main() {
   const urlRegEx =
     /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)/g
 
-  for (const file of searchFiles) {
-    const contents = await getContents('github', 'github', 'master', file)
-
-    if (
-      contents.includes('https://docs.github.com') ||
-      contents.includes('GitHub.help_url') ||
-      contents.includes('GitHub.developer_help_url')
-    ) {
-      const docsIndices = getIndicesOf('https://docs.github.com', contents)
-      const helpIndices = getIndicesOf('GitHub.help_url', contents)
-      helpIndices.push(...getIndicesOf('GitHub.developer_help_url', contents))
-      if (docsIndices.length > 0) {
-        docsIndices.forEach((numIndex) => {
-          // Assuming we don't have links close to 500 characters long
-          const docsLink = contents.substring(numIndex, numIndex + 500).match(urlRegEx)
-          docsLinksFiles.push([docsLink[0].toString().replace(/[^a-zA-Z0-9]*$|\\n$/g, ''), file])
-        })
-      }
+  try {
+    docsLinksFiles.push(...JSON.parse(await fs.readFile('/tmp/docsLinksFiles.json', 'utf-8')))
+  } catch (error) {
+    if (!(error.code && error.code === 'ENOENT')) {
+      throw error
+    }
+  }
 
-      if (helpIndices.length > 0) {
-        helpIndices.forEach((numIndex) => {
-          // There are certain links like #{GitHub.help_url}#{learn_more_path} and #{GitHub.developer_help_url}#{learn_more_path} that we should skip
-          if (
-            (contents.substring(numIndex, numIndex + 11) === 'GitHub.help' &&
-              contents.charAt(numIndex + 16) === '#') ||
-            (contents.substring(numIndex, numIndex + 16) === 'GitHub.developer' &&
-              contents.charAt(numIndex + 26) === '#')
-          ) {
-            return
-          }
+  if (!docsLinksFiles.length || FORCE_DOWNLOAD) {
+    for (const file of searchFiles) {
+      const contents = await getContents('github', 'github', 'master', file)
+
+      if (
+        contents.includes('https://docs.github.com') ||
+        contents.includes('GitHub.help_url') ||
+        contents.includes('GitHub.developer_help_url')
+      ) {
+        const docsIndices = getIndicesOf('https://docs.github.com', contents)
+        const helpIndices = getIndicesOf('GitHub.help_url', contents)
+        helpIndices.push(...getIndicesOf('GitHub.developer_help_url', contents))
+        if (docsIndices.length > 0) {
+          docsIndices.forEach((numIndex) => {
+            // Assuming we don't have links close to 500 characters long
+            const docsLink = contents.substring(numIndex, numIndex + 500).match(urlRegEx)
+            const linkURL = new URL(docsLink[0].toString().replace(/[^a-zA-Z0-9]*$|\\n$/g, ''))
+            const linkPath = linkURL.pathname + linkURL.hash
+            docsLinksFiles.push({ linkPath, file })
+          })
+        }
+
+        if (helpIndices.length > 0) {
+          helpIndices.forEach((numIndex) => {
+            // There are certain links like #{GitHub.help_url}#{learn_more_path} and #{GitHub.developer_help_url}#{learn_more_path} that we should skip
+            if (
+              (contents.substring(numIndex, numIndex + 11) === 'GitHub.help' &&
+                contents.charAt(numIndex + 16) === '#') ||
+              (contents.substring(numIndex, numIndex + 16) === 'GitHub.developer' &&
+                contents.charAt(numIndex + 26) === '#')
+            ) {
+              return
+            }
 
-          const startSearchIndex = contents.indexOf('/', numIndex)
-          // Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
-          // There are certain links that don't start with `/` so we want to skip those.
-          // If there's no `/` within 30 characters of GitHub.help_url/GitHub.developer_help_url, skip
-          if (startSearchIndex - numIndex < 30) {
-            const linkPath = contents
-              .substring(
-                startSearchIndex,
-                regexIndexOf(
-                  contents,
-                  /\n|"\)|{@email_tracking_params}|\^http|Ahttps|example|This|TODO"|[{}|"%><.,')* ]/,
-                  startSearchIndex + 1
+            const startSearchIndex = contents.indexOf('/', numIndex)
+            // Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
+            // There are certain links that don't start with `/` so we want to skip those.
+            // If there's no `/` within 30 characters of GitHub.help_url/GitHub.developer_help_url, skip
+            if (startSearchIndex - numIndex < 30) {
+              const linkPath = contents
+                .substring(
+                  startSearchIndex,
+                  regexIndexOf(
+                    contents,
+                    /\n|"\)|{@email_tracking_params}|\^http|Ahttps|example|This|TODO"|[{}|"%><.,')* ]/,
+                    startSearchIndex + 1
+                  )
                 )
-              )
-              .trim()
+                .trim()
 
-            // Certain specific links can be ignored as well
-            if (['/deprecation-1'].includes(linkPath)) {
-              return
-            }
+              // Certain specific links can be ignored as well
+              if (['/deprecation-1'].includes(linkPath)) {
+                return
+              }
 
-            docsLinksFiles.push([`https://docs.github.com${linkPath}`, file])
-          }
-        })
+              docsLinksFiles.push({ linkPath, file })
+            }
+          })
+        }
       }
     }
+    await fs.writeFile(
+      '/tmp/docsLinksFiles.json',
+      JSON.stringify(docsLinksFiles, undefined, 2),
+      'utf-8'
+    )
   }
-
   const brokenLinks = []
-  // Done serially with delay to avoid hitting the rate limiter
-  for (const file of docsLinksFiles) {
-    try {
-      await got(file[0], {
-        headers: {
-          'X-WAF-TOKEN': process.env.WAF_TOKEN,
-        },
+
+  // Break up the long list of URLs to test into batches
+  for (const batch of [...Array(Math.floor(docsLinksFiles.length / BATCH_SIZE)).keys()]) {
+    const slice = docsLinksFiles.slice(batch * BATCH_SIZE, batch * BATCH_SIZE + BATCH_SIZE)
+    await Promise.all(
+      slice.map(async ({ linkPath, file }) => {
+        // This isn't necessary but if it can't be constructed, it'll
+        // fail in quite a nice way and not "blame got".
+        const url = new URL(BASE_URL + linkPath)
+        try {
+          await got(url.href, {
+            retry: retryConfiguration,
+            timeout: timeoutConfiguration,
+          })
+        } catch (error) {
+          if (error instanceof RequestError) {
+            brokenLinks.push({ linkPath, file })
+          } else {
+            console.warn(`URL when it threw: ${url}`)
+            throw error
+          }
+        }
       })
-    } catch (e) {
-      brokenLinks.push(file)
-    } finally {
-      await sleep(300)
-    }
+    )
   }
 
   if (!brokenLinks.length) {
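Instead of the old serial loop that slept 300 ms between requests, the rewritten check walks docsLinksFiles in slices of BATCH_SIZE and fires each slice concurrently with Promise.all. A small self-contained illustration of that slicing follows; the sample data is invented, and note that, as written, Math.floor means a trailing partial batch smaller than BATCH_SIZE is not visited.

// Sketch of the batching used above: carve docsLinksFiles into BATCH_SIZE-sized
// slices and process each slice concurrently. The sample data is made up.
const BATCH_SIZE = 10
const docsLinksFiles = Array.from({ length: 25 }, (_, i) => ({
  linkPath: `/en/page-${i}`,
  file: `app/views/example_${i}.erb`,
}))

for (const batch of [...Array(Math.floor(docsLinksFiles.length / BATCH_SIZE)).keys()]) {
  const slice = docsLinksFiles.slice(batch * BATCH_SIZE, batch * BATCH_SIZE + BATCH_SIZE)
  // With 25 items and BATCH_SIZE 10 this visits batches 0 and 1 (20 items);
  // the trailing partial batch of 5 is left unvisited by the Math.floor bound.
  console.log(`batch ${batch}: ${slice.length} links checked concurrently`)
}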
