Commit 2fe74dc

check broken links using localhost (github#27010)
* Revert "fix(link-check): add WAF token to avoid 403 (github#26964)"; this reverts commit 8bb5251.
* check broken links using localhost
* remove temp comment
1 parent e534711 commit 2fe74dc

2 files changed: +147 -71 lines changed
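In short: the previous check sent each candidate link to production docs.github.com with a WAF token header; this commit builds and starts the docs server inside the workflow and checks each extracted link path against http://localhost:4000 instead. Below is a minimal sketch of that per-link check, reusing the retry and timeout settings the new script introduces; the checkOne helper and the sample path are illustrative assumptions, not part of the commit.

// Sketch only: check a single docs path against the locally running server.
// `checkOne` and the example path are illustrative, not from the commit.
import got, { RequestError } from 'got'

const BASE_URL = process.env.BASE_URL || 'http://localhost:4000'
const retryConfiguration = { limit: 3 }
const timeoutConfiguration = 1000

async function checkOne(linkPath) {
  const url = new URL(BASE_URL + linkPath)
  try {
    await got(url.href, { retry: retryConfiguration, timeout: timeoutConfiguration })
    return true // the local server answered successfully
  } catch (error) {
    if (error instanceof RequestError) return false // counts as a broken link
    throw error // anything else (e.g. a programming error) should surface
  }
}

console.log(await checkOne('/en/get-started')) // example path, made up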

.github/workflows/check-broken-links-github-github.yml

Lines changed: 13 additions & 2 deletions
@@ -50,9 +50,20 @@ jobs:
       - name: Install Node.js dependencies
         run: npm ci
 
-      - name: Run broken github/github link check
+      - name: Build server
+        run: npm run build
+
+      - name: Start server in the background
         env:
-          WAF_TOKEN: ${{ secrets.WAF_TOKEN }}
+          NODE_ENV: production
+          PORT: 4000
+        run: |
+
+          node server.mjs &
+          sleep 5
+          curl --retry-connrefused --retry 3 -I http://localhost:4000/
+
+      - name: Run broken github/github link check
         run: |
           script/check-github-github-links.js > broken_github_github_links.md
 
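The new "Start server in the background" step launches node server.mjs, sleeps briefly, and uses curl --retry-connrefused --retry 3 to confirm the server answers on port 4000 before the link check runs. A rough Node equivalent of that readiness wait, for illustration only (waitForServer, the attempt count, and the delay are assumptions, not part of the commit):

// Illustrative alternative to the workflow's curl readiness check: poll the
// locally started server until it answers, or give up after a few attempts.
import got from 'got'

const BASE_URL = process.env.BASE_URL || 'http://localhost:4000'

async function waitForServer(attempts = 3, delayMs = 2000) {
  for (let attempt = 1; attempt <= attempts; attempt++) {
    try {
      await got.head(BASE_URL + '/') // same idea as `curl -I` against the root page
      return
    } catch {
      if (attempt === attempts) throw new Error(`Server did not come up at ${BASE_URL}`)
      await new Promise((resolve) => setTimeout(resolve, delayMs))
    }
  }
}

await waitForServer()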
script/check-github-github-links.js

Lines changed: 134 additions & 69 deletions
@@ -6,22 +6,60 @@
 //
 // [end-readme]
 
+import fs from 'fs/promises'
+
+import got, { RequestError } from 'got'
+
 import { getContents, getPathsWithMatchingStrings } from './helpers/git-utils.js'
-import got from 'got'
 
 if (!process.env.GITHUB_TOKEN) {
-  console.error('Error! You must have a GITHUB_TOKEN set in an .env file to run this script.')
-  process.exit(1)
+  throw new Error('Error! You must have a GITHUB_TOKEN set in an .env file to run this script.')
 }
 
-const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
+const FORCE_DOWNLOAD = Boolean(JSON.parse(process.env.FORCE_DOWNLOAD || 'false'))
+const BATCH_SIZE = JSON.parse(process.env.BATCH_SIZE || '10')
+const BASE_URL = process.env.BASE_URL || 'http://localhost:4000'
 
 main()
 
+// The way `got` does retries:
+//
+// sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
+//
+// So, it means:
+//
+// 1. ~1000ms
+// 2. ~2000ms
+// 3. ~4000ms
+//
+// ...if the limit we set is 3.
+// Our own timeout, in ./middleware/timeout.js defaults to 10 seconds.
+// So there's no point in trying more attempts than 3 because it would
+// just timeout on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
+const retryConfiguration = {
+  limit: 3,
+}
+// According to our Datadog metrics, the *average* time for the
+// the 'archive_enterprise_proxy' metric is ~70ms (excluding spikes)
+// which much less than 500ms.
+const timeoutConfiguration = 1000
+
 async function main() {
   const searchStrings = ['https://docs.github.com', 'GitHub help_url', 'GitHub developer_help_url']
-  const foundFiles = await getPathsWithMatchingStrings(searchStrings, 'github', 'github')
-  const searchFiles = [...foundFiles]
+
+  const foundFiles = []
+  try {
+    foundFiles.push(...JSON.parse(await fs.readFile('/tmp/foundFiles.json', 'utf-8')))
+  } catch (error) {
+    if (!(error.code && error.code === 'ENOENT')) {
+      throw error
+    }
+  }
+  if (!foundFiles.length || FORCE_DOWNLOAD) {
+    foundFiles.push(...(await getPathsWithMatchingStrings(searchStrings, 'github', 'github')))
+    await fs.writeFile('/tmp/foundFiles.json', JSON.stringify(foundFiles, undefined, 2), 'utf-8')
+  }
+  const searchFiles = [...new Set(foundFiles)] // filters out dupes
     .filter((file) => endsWithAny(['.rb', '.yml', '.yaml', '.txt', '.pdf', '.erb', '.js'], file))
     .filter(
       (file) =>
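The comment block added in this hunk spells out got's default backoff, sleep = 1000 * 2^(retry - 1) + Math.random() * 100, hence roughly 1s, 2s, and 4s for three retries, which is why a retry limit of 3 is paired with the ~10 second middleware timeout. A tiny worked example of that arithmetic, for illustration:

// Worked example of the backoff formula quoted in the comment above.
for (const retry of [1, 2, 3]) {
  const sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
  console.log(`retry ${retry}: wait ~${Math.round(sleep)} ms`) // ~1000, ~2000, ~4000
}
// Worst case the three retries add about 1000 + 2000 + 4000 = 7000 ms of waiting;
// a fourth retry (~8000 ms more) would overshoot the 10 s middleware timeout.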
@@ -35,79 +73,106 @@ async function main() {
   const urlRegEx =
     /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)/g
 
-  for (const file of searchFiles) {
-    const contents = await getContents('github', 'github', 'master', file)
-
-    if (
-      contents.includes('https://docs.github.com') ||
-      contents.includes('GitHub.help_url') ||
-      contents.includes('GitHub.developer_help_url')
-    ) {
-      const docsIndices = getIndicesOf('https://docs.github.com', contents)
-      const helpIndices = getIndicesOf('GitHub.help_url', contents)
-      helpIndices.push(...getIndicesOf('GitHub.developer_help_url', contents))
-      if (docsIndices.length > 0) {
-        docsIndices.forEach((numIndex) => {
-          // Assuming we don't have links close to 500 characters long
-          const docsLink = contents.substring(numIndex, numIndex + 500).match(urlRegEx)
-          docsLinksFiles.push([docsLink[0].toString().replace(/[^a-zA-Z0-9]*$|\\n$/g, ''), file])
-        })
-      }
+  try {
+    docsLinksFiles.push(...JSON.parse(await fs.readFile('/tmp/docsLinksFiles.json', 'utf-8')))
+  } catch (error) {
+    if (!(error.code && error.code === 'ENOENT')) {
+      throw error
+    }
+  }
 
-      if (helpIndices.length > 0) {
-        helpIndices.forEach((numIndex) => {
-          // There are certain links like #{GitHub.help_url}#{learn_more_path} and #{GitHub.developer_help_url}#{learn_more_path} that we should skip
-          if (
-            (contents.substring(numIndex, numIndex + 11) === 'GitHub.help' &&
-              contents.charAt(numIndex + 16) === '#') ||
-            (contents.substring(numIndex, numIndex + 16) === 'GitHub.developer' &&
-              contents.charAt(numIndex + 26) === '#')
-          ) {
-            return
-          }
+  if (!docsLinksFiles.length || FORCE_DOWNLOAD) {
+    for (const file of searchFiles) {
+      const contents = await getContents('github', 'github', 'master', file)
+
+      if (
+        contents.includes('https://docs.github.com') ||
+        contents.includes('GitHub.help_url') ||
+        contents.includes('GitHub.developer_help_url')
+      ) {
+        const docsIndices = getIndicesOf('https://docs.github.com', contents)
+        const helpIndices = getIndicesOf('GitHub.help_url', contents)
+        helpIndices.push(...getIndicesOf('GitHub.developer_help_url', contents))
+        if (docsIndices.length > 0) {
+          docsIndices.forEach((numIndex) => {
+            // Assuming we don't have links close to 500 characters long
+            const docsLink = contents.substring(numIndex, numIndex + 500).match(urlRegEx)
+            const linkURL = new URL(docsLink[0].toString().replace(/[^a-zA-Z0-9]*$|\\n$/g, ''))
+            const linkPath = linkURL.pathname + linkURL.hash
+            docsLinksFiles.push({ linkPath, file })
+          })
+        }
+
+        if (helpIndices.length > 0) {
+          helpIndices.forEach((numIndex) => {
+            // There are certain links like #{GitHub.help_url}#{learn_more_path} and #{GitHub.developer_help_url}#{learn_more_path} that we should skip
+            if (
+              (contents.substring(numIndex, numIndex + 11) === 'GitHub.help' &&
+                contents.charAt(numIndex + 16) === '#') ||
+              (contents.substring(numIndex, numIndex + 16) === 'GitHub.developer' &&
+                contents.charAt(numIndex + 26) === '#')
+            ) {
+              return
+            }
 
-          const startSearchIndex = contents.indexOf('/', numIndex)
-          // Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
-          // There are certain links that don't start with `/` so we want to skip those.
-          // If there's no `/` within 30 characters of GitHub.help_url/GitHub.developer_help_url, skip
-          if (startSearchIndex - numIndex < 30) {
-            const linkPath = contents
-              .substring(
-                startSearchIndex,
-                regexIndexOf(
-                  contents,
-                  /\n|"\)|{@email_tracking_params}|\^http|Ahttps|example|This|TODO"|[{}|"%><.,')* ]/,
-                  startSearchIndex + 1
+            const startSearchIndex = contents.indexOf('/', numIndex)
+            // Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
+            // There are certain links that don't start with `/` so we want to skip those.
+            // If there's no `/` within 30 characters of GitHub.help_url/GitHub.developer_help_url, skip
+            if (startSearchIndex - numIndex < 30) {
+              const linkPath = contents
+                .substring(
+                  startSearchIndex,
+                  regexIndexOf(
+                    contents,
+                    /\n|"\)|{@email_tracking_params}|\^http|Ahttps|example|This|TODO"|[{}|"%><.,')* ]/,
+                    startSearchIndex + 1
+                  )
                 )
-              )
-              .trim()
+                .trim()
 
-            // Certain specific links can be ignored as well
-            if (['/deprecation-1'].includes(linkPath)) {
-              return
-            }
+              // Certain specific links can be ignored as well
+              if (['/deprecation-1'].includes(linkPath)) {
+                return
+              }
 
-            docsLinksFiles.push([`https://docs.github.com${linkPath}`, file])
-          }
-        })
+              docsLinksFiles.push({ linkPath, file })
+            }
+          })
+        }
       }
     }
+    await fs.writeFile(
+      '/tmp/docsLinksFiles.json',
+      JSON.stringify(docsLinksFiles, undefined, 2),
+      'utf-8'
+    )
   }
-
   const brokenLinks = []
-  // Done serially with delay to avoid hitting the rate limiter
-  for (const file of docsLinksFiles) {
-    try {
-      await got(file[0], {
-        headers: {
-          'X-WAF-TOKEN': process.env.WAF_TOKEN,
-        },
+
+  // Break up the long list of URLs to test into batches
+  for (const batch of [...Array(Math.floor(docsLinksFiles.length / BATCH_SIZE)).keys()]) {
+    const slice = docsLinksFiles.slice(batch * BATCH_SIZE, batch * BATCH_SIZE + BATCH_SIZE)
+    await Promise.all(
+      slice.map(async ({ linkPath, file }) => {
+        // This isn't necessary but if it can't be constructed, it'll
+        // fail in quite a nice way and not "blame got".
+        const url = new URL(BASE_URL + linkPath)
+        try {
+          await got(url.href, {
+            retry: retryConfiguration,
+            timeout: timeoutConfiguration,
+          })
+        } catch (error) {
+          if (error instanceof RequestError) {
+            brokenLinks.push({ linkPath, file })
+          } else {
+            console.warn(`URL when it threw: ${url}`)
+            throw error
+          }
+        }
       })
-    } catch (e) {
-      brokenLinks.push(file)
-    } finally {
-      await sleep(300)
-    }
+    )
   }
 
   if (!brokenLinks.length) {
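Instead of the old serial loop that slept 300 ms between requests, the rewritten check walks docsLinksFiles in slices of BATCH_SIZE and fires each slice concurrently with Promise.all. A small self-contained illustration of that slicing follows; the sample data is invented, and note that, as written, Math.floor means a trailing partial batch smaller than BATCH_SIZE is not visited.

// Sketch of the batching used above: carve docsLinksFiles into BATCH_SIZE-sized
// slices and process each slice concurrently. The sample data is made up.
const BATCH_SIZE = 10
const docsLinksFiles = Array.from({ length: 25 }, (_, i) => ({
  linkPath: `/en/page-${i}`,
  file: `app/views/example_${i}.erb`,
}))

for (const batch of [...Array(Math.floor(docsLinksFiles.length / BATCH_SIZE)).keys()]) {
  const slice = docsLinksFiles.slice(batch * BATCH_SIZE, batch * BATCH_SIZE + BATCH_SIZE)
  // With 25 items and BATCH_SIZE 10 this visits batches 0 and 1 (20 items);
  // the trailing partial batch of 5 is left unvisited by the Math.floor bound.
  console.log(`batch ${batch}: ${slice.length} links checked concurrently`)
}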
