//
// [end-readme]
+ import fs from 'fs/promises'
+
+ import got, { RequestError } from 'got'
+
import { getContents, getPathsWithMatchingStrings } from './helpers/git-utils.js'
- import got from 'got'

if (!process.env.GITHUB_TOKEN) {
-   console.error('Error! You must have a GITHUB_TOKEN set in an .env file to run this script.')
-   process.exit(1)
+   throw new Error('Error! You must have a GITHUB_TOKEN set in an .env file to run this script.')
}

- const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
+ const FORCE_DOWNLOAD = Boolean(JSON.parse(process.env.FORCE_DOWNLOAD || 'false'))
+ const BATCH_SIZE = JSON.parse(process.env.BATCH_SIZE || '10')
+ const BASE_URL = process.env.BASE_URL || 'http://localhost:4000'
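+ // All three can be overridden from the environment when running the script,
+ // e.g. `FORCE_DOWNLOAD=true BATCH_SIZE=20 node script/check-github-github-links.js`
+ // (script filename shown for illustration).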

main()

+ // The way `got` does retries:
+ //
+ //   sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
+ //
+ // So, it means:
+ //
+ //   1. ~1000ms
+ //   2. ~2000ms
+ //   3. ~4000ms
+ //
+ // ...if the limit we set is 3.
+ // Our own timeout, in ./middleware/timeout.js, defaults to 10 seconds.
+ // So there's no point in allowing more than 3 attempts: a 4th retry
+ // would just run into that 10s timeout (i.e. 1000 + 2000 + 4000 + 8000 > 10,000).
+ const retryConfiguration = {
+   limit: 3,
+ }
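+ // Both of these configurations are passed to each got() call in the
+ // batch loop further down.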
+ // According to our Datadog metrics, the *average* time for the
+ // 'archive_enterprise_proxy' metric is ~70ms (excluding spikes),
+ // which is much less than the timeout set here.
+ const timeoutConfiguration = 1000
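+ // Note: the value is in milliseconds. When exceeded, got throws a
+ // TimeoutError, which is a subclass of RequestError, so timeouts get
+ // counted as broken links in the catch further down.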
+
async function main() {
  const searchStrings = ['https://docs.github.com', 'GitHub help_url', 'GitHub developer_help_url']
-   const foundFiles = await getPathsWithMatchingStrings(searchStrings, 'github', 'github')
-   const searchFiles = [...foundFiles]
+
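+   // Load the candidate file list cached by a previous run, if any;
+   // a missing cache file (ENOENT) just means this is a fresh run.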
+   const foundFiles = []
+   try {
+     foundFiles.push(...JSON.parse(await fs.readFile('/tmp/foundFiles.json', 'utf-8')))
+   } catch (error) {
+     if (!(error.code && error.code === 'ENOENT')) {
+       throw error
+     }
+   }
+   if (!foundFiles.length || FORCE_DOWNLOAD) {
+     foundFiles.push(...(await getPathsWithMatchingStrings(searchStrings, 'github', 'github')))
+     await fs.writeFile('/tmp/foundFiles.json', JSON.stringify(foundFiles, undefined, 2), 'utf-8')
+   }
+   const searchFiles = [...new Set(foundFiles)] // filters out dupes
    .filter((file) => endsWithAny(['.rb', '.yml', '.yaml', '.txt', '.pdf', '.erb', '.js'], file))
    .filter(
      (file) =>
@@ -35,79 +73,106 @@ async function main() {
  const urlRegEx =
    /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)/g

-   for (const file of searchFiles) {
-     const contents = await getContents('github', 'github', 'master', file)
-
-     if (
-       contents.includes('https://docs.github.com') ||
-       contents.includes('GitHub.help_url') ||
-       contents.includes('GitHub.developer_help_url')
-     ) {
-       const docsIndices = getIndicesOf('https://docs.github.com', contents)
-       const helpIndices = getIndicesOf('GitHub.help_url', contents)
-       helpIndices.push(...getIndicesOf('GitHub.developer_help_url', contents))
-       if (docsIndices.length > 0) {
-         docsIndices.forEach((numIndex) => {
-           // Assuming we don't have links close to 500 characters long
-           const docsLink = contents.substring(numIndex, numIndex + 500).match(urlRegEx)
-           docsLinksFiles.push([docsLink[0].toString().replace(/[^a-zA-Z0-9]*$|\\n$/g, ''), file])
-         })
-       }
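+   // As with foundFiles above, reuse the links cached by a previous run.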
+   try {
+     docsLinksFiles.push(...JSON.parse(await fs.readFile('/tmp/docsLinksFiles.json', 'utf-8')))
+   } catch (error) {
+     if (!(error.code && error.code === 'ENOENT')) {
+       throw error
+     }
+   }

-       if (helpIndices.length > 0) {
-         helpIndices.forEach((numIndex) => {
-           // There are certain links like #{GitHub.help_url}#{learn_more_path} and #{GitHub.developer_help_url}#{learn_more_path} that we should skip
-           if (
-             (contents.substring(numIndex, numIndex + 11) === 'GitHub.help' &&
-               contents.charAt(numIndex + 16) === '#') ||
-             (contents.substring(numIndex, numIndex + 16) === 'GitHub.developer' &&
-               contents.charAt(numIndex + 26) === '#')
-           ) {
-             return
-           }
+   if (!docsLinksFiles.length || FORCE_DOWNLOAD) {
+     for (const file of searchFiles) {
+       const contents = await getContents('github', 'github', 'master', file)
+
+       if (
+         contents.includes('https://docs.github.com') ||
+         contents.includes('GitHub.help_url') ||
+         contents.includes('GitHub.developer_help_url')
+       ) {
+         const docsIndices = getIndicesOf('https://docs.github.com', contents)
+         const helpIndices = getIndicesOf('GitHub.help_url', contents)
+         helpIndices.push(...getIndicesOf('GitHub.developer_help_url', contents))
+         if (docsIndices.length > 0) {
+           docsIndices.forEach((numIndex) => {
+             // Assuming we don't have links close to 500 characters long
+             const docsLink = contents.substring(numIndex, numIndex + 500).match(urlRegEx)
+             const linkURL = new URL(docsLink[0].toString().replace(/[^a-zA-Z0-9]*$|\\n$/g, ''))
+             const linkPath = linkURL.pathname + linkURL.hash
+             docsLinksFiles.push({ linkPath, file })
+           })
+         }
+
+         if (helpIndices.length > 0) {
+           helpIndices.forEach((numIndex) => {
+             // There are certain links like #{GitHub.help_url}#{learn_more_path} and #{GitHub.developer_help_url}#{learn_more_path} that we should skip
+             if (
+               (contents.substring(numIndex, numIndex + 11) === 'GitHub.help' &&
+                 contents.charAt(numIndex + 16) === '#') ||
+               (contents.substring(numIndex, numIndex + 16) === 'GitHub.developer' &&
+                 contents.charAt(numIndex + 26) === '#')
+             ) {
+               return
+             }

-           const startSearchIndex = contents.indexOf('/', numIndex)
-           // Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
-           // There are certain links that don't start with `/` so we want to skip those.
-           // If there's no `/` within 30 characters of GitHub.help_url/GitHub.developer_help_url, skip
-           if (startSearchIndex - numIndex < 30) {
-             const linkPath = contents
-               .substring(
-                 startSearchIndex,
-                 regexIndexOf(
-                   contents,
-                   /\n|"\)|{@email_tracking_params}|\^http|Ahttps|example|This|TODO"|[{}|"%><.,')*]/,
-                   startSearchIndex + 1
+             const startSearchIndex = contents.indexOf('/', numIndex)
+             // Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
+             // There are certain links that don't start with `/` so we want to skip those.
+             // If there's no `/` within 30 characters of GitHub.help_url/GitHub.developer_help_url, skip
+             if (startSearchIndex - numIndex < 30) {
+               const linkPath = contents
+                 .substring(
+                   startSearchIndex,
+                   regexIndexOf(
+                     contents,
+                     /\n|"\)|{@email_tracking_params}|\^http|Ahttps|example|This|TODO"|[{}|"%><.,')*]/,
+                     startSearchIndex + 1
+                   )
                )
-               )
-               .trim()
+                 .trim()

-             // Certain specific links can be ignored as well
-             if (['/deprecation-1'].includes(linkPath)) {
-               return
-             }
+               // Certain specific links can be ignored as well
+               if (['/deprecation-1'].includes(linkPath)) {
+                 return
+               }

-             docsLinksFiles.push([`https://docs.github.com${linkPath}`, file])
-           }
-         })
+               docsLinksFiles.push({ linkPath, file })
+             }
+           })
+         }
      }
    }
+     await fs.writeFile(
+       '/tmp/docsLinksFiles.json',
+       JSON.stringify(docsLinksFiles, undefined, 2),
+       'utf-8'
+     )
  }
-
  const brokenLinks = []
-   // Done serially with delay to avoid hitting the rate limiter
-   for (const file of docsLinksFiles) {
-     try {
-       await got(file[0], {
-         headers: {
-           'X-WAF-TOKEN': process.env.WAF_TOKEN,
-         },
+
+   // Break up the long list of URLs to test into batches
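+   // (Math.ceil rather than Math.floor so the final, possibly partial,
+   // batch isn't dropped; .slice() clamps the end index for us.)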
+   for (const batch of [...Array(Math.ceil(docsLinksFiles.length / BATCH_SIZE)).keys()]) {
+     const slice = docsLinksFiles.slice(batch * BATCH_SIZE, batch * BATCH_SIZE + BATCH_SIZE)
+     await Promise.all(
+       slice.map(async ({ linkPath, file }) => {
+         // This isn't strictly necessary, but if the URL can't be constructed
+         // it'll fail here in an obvious way rather than "blame got".
+         const url = new URL(BASE_URL + linkPath)
+         try {
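+           // got throws an HTTPError for non-2xx responses and a TimeoutError
+           // when the timeout is exceeded; both are RequestError subclasses,
+           // so both end up recorded as broken links.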
+           await got(url.href, {
+             retry: retryConfiguration,
+             timeout: timeoutConfiguration,
+           })
+         } catch (error) {
+           if (error instanceof RequestError) {
+             brokenLinks.push({ linkPath, file })
+           } else {
+             console.warn(`URL when it threw: ${url}`)
+             throw error
+           }
+         }
      })
-     } catch (e) {
-       brokenLinks.push(file)
-     } finally {
-       await sleep(300)
-     }
+     )
  }

  if (!brokenLinks.length) {