-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworkflows.js
692 lines (637 loc) · 27.8 KB
/
workflows.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
'use strict'
const { debuglog } = require('util')
const { setTimeout } = require('node:timers/promises')
const debug = debuglog('scraper')
/**
 * Send one tab-separated result row to the renderer on the `print` channel.
 *
 * The row has three columns: link, title, and the members joined with `;`.
 * Every column has runs of whitespace (spaces, tabs, CR/LF, ...) collapsed to
 * a single space so scraped text can never introduce an extra column or row.
 *
 * @param {string} link - First column.
 * @param {string} title - Second column.
 * @param {string[]} members - Matching entries; joined with `;` into the third column.
 * @param {{ reply: Function }} event - Object whose `reply(channel, text)` receives the row.
 */
const printTsvLine = (link, title, members, event) => {
  // `?? ''` mirrors Array.prototype.join, which renders null/undefined as ''.
  const toCell = (value) => String(value ?? '').replace(/\s+/g, ' ')
  const row = [link, title, members.join(';')].map(toCell).join('\t')
  event.reply('print', `${row}\n`)
}
const workflows = {
ASCO: {
start: 'https://ascopubs.org/',
getData: async (page, event) => {
debug('Getting ASCO journal links...')
const journalLinks = await page.$$eval('a', (el) =>
el.filter((el) => el.href.startsWith('https://ascopubs.org/journal/')).map((el) => el.href)
)
const deduppedLinks = [...new Set(journalLinks)]
const publicationsList = []
for (const link of deduppedLinks) {
debug(`Getting title from ${link}...`)
await page.goto(link)
const title = await page.$eval('title', (el) => el.innerText)
if (!title.endsWith('Educational Book')) {
publicationsList.push({ href: link, title })
} else {
debug(`Skipping ${title}...`)
}
}
for (const publication of publicationsList) {
debug(`Getting editorial board members from ${publication.href}...`)
await page.goto(`${publication.href.replace('/journal', '')}/about/editorial-roster`)
const members = await page.$eval('div.tab-content', (el) => {
const entries = el.innerText.split('\n').filter((val) => val.includes('San Francisco') || val.includes('UCSF'))
return entries
})
printTsvLine(publication.href, publication.title, members, event)
}
}
},
AMA: {
start: 'https://jamanetwork.com/',
getData: async (page, event) => {
debug('Getting AMA journal names and links...')
const publicationsList = await page.$$eval('div.widget-instance-PublicationDropdown a', (el) =>
el.filter((el) => el.href.startsWith('http')).map((el) => { return { href: el.href, title: el.innerText } })
)
for (const publication of publicationsList) {
if (publication.href.startsWith('https://jamanetwork.com/journals/archneurpsyc')) {
continue
}
const links = [
publication.href.replace(/\/[^/]+$/, '/editors-and-publishers'),
publication.href.replace(/\/[^/]+$/, '/pages/about')
]
for (const link of links) {
const result = await page.goto(link)
const status = result.status()
debug(`Status: ${status}`)
if (status === 200) {
break
}
}
const members = await page.$eval('body', (el) =>
// Matching on San Francisco preceded by a space to keep it from matching job listings for San Francisco.
el.innerText.split('\n').filter((val) => val.includes(' San Francisco')).map((val) => val.slice(0, val.indexOf(',')))
)
printTsvLine(publication.href, publication.title, members, event)
}
}
},
BMC: {
start: 'https://www.biomedcentral.com/journals',
getData: async (page, event) => {
debug('Getting BMC board links...')
const boardLinks = await page.$$eval('a.u-ml-8', (el) => {
// Skip these links
const skipLinks = [
'https://www.biomedcentral.com/getpublished/peer-review-process',
'https://cancercommun.biomedcentral.com/' // no longer published by BMC
]
return el.filter((el) => !skipLinks.includes(el.href)).map((el) => `${el.href}about/editorial-board`)
})
for (const link of boardLinks) {
debug(`Getting title and board from ${link}...`)
await page.goto(link)
const rawTitle = (await page.title())
const title = rawTitle.substring(0, rawTitle.indexOf('|')).trim()
debug(`Got title ${title}...`)
const members = await page.$eval('body', (el) =>
el.innerText.split('\n').filter((val) => val.includes('San Francisco'))
)
printTsvLine(link, title, members, event)
}
}
},
BMJ: {
start: 'https://journals.bmj.com/',
getData: async (page, event) => {
debug('Getting BMJ journal titles and board links...')
const journals = await page.$$eval('li.journal-title a', (el) => {
return el.map((el) => { return { href: `${el.href}/pages/editorial-board/`, title: el.innerText } })
})
for (const { href, title } of journals) {
debug(`Getting editorial board members from ${href}...`)
await page.goto(href)
const members = await page.$$eval('p', (el) => {
const entries = el.filter((val) => val.innerText.includes('San Francisco'))
return entries.map((el) => {
const text = el.innerText
return text.substring(0, text.indexOf('\n'))
})
}
)
printTsvLine(href, title, members, event)
}
}
},
Elsevier: {
start: 'https://www.sciencedirect.com/browse/journals-and-books?contentType=JL',
getData: async (page, event) => {
// TODO: This should be done for all workflows, probably.
await page.setRequestInterception(true)
page.on('request', (req) => {
if (['image', 'stylesheet', 'font'].includes(req.resourceType())) {
req.abort()
} else {
req.continue()
}
})
const publicationsList = []
debug('Getting Elsevier journal names and links...')
const links = await page.$$eval('a.js-publication-title', (el) => {
return el.map(el => { return { title: el.innerText, link: el.href } })
})
publicationsList.push(...links)
let previousTitle = links[0].title
debug('Got first set of journal links....')
while (true) {
debug('Waiting for next-page link to appear on page...')
await page.waitForSelector('button[aria-label="Next page"]')
debug('Getting next page link...')
const linkIsDisabled = await page.$eval('button[aria-label="Next page"]', (el) => el.disabled)
if (linkIsDisabled) {
debug('Next page link is disabled, no more links...')
break
}
debug('Clicking next page link...')
await page.click('button[aria-label="Next page"]')
while (await page.$eval('a.js-publication-title', (el) => el.innerText) === previousTitle) {
debug('Waiting for page to load...')
await setTimeout(100)
}
debug('Getting journal links...')
const links = await page.$$eval('a.js-publication-title', (el) => {
return el.map(el => { return { title: el.innerText, link: el.href } })
})
publicationsList.push(...links)
debug(`Previous first title: ${previousTitle}, current first title: ${links[0].title}`)
previousTitle = links[0].title
}
debug(`Retrieved ${publicationsList.length} journal links...`)
for (const publication of publicationsList) {
// debug('Chilling out for 2 seconds before getting editorial board...')
// await setTimeout(2000)
const editorsLink = publication.link + '/about/editorial-board'
debug(`Navigating to ${editorsLink}...`)
try {
await page.goto(editorsLink, { waitFor: 'networkidle2' })
} catch (e) {
debug(`Error navigating to ${editorsLink}: ${e}`)
// It's probably a timeout on an image or something and it's probably fine to continue. ¯\_(ツ)_/¯
continue
}
const board = await page.$$eval('div.editor-group', (elements) => elements.map((el) => el.innerText))
debug('Full board: ' + JSON.stringify(board))
const members = board.filter((val) => /San Francisco|UCSF/.test(val))
debug('UCSF members: ' + JSON.stringify(members))
printTsvLine(publication.title, publication.link, members, event)
}
}
},
// TODO: Need to find an XML parser that is maintained.
// Until then, this code is broken. :-(
// LWW: {
// start: 'https://journals.lww.com/_layouts/15/oaks.journals/Sitemap_xml.aspx?format=xml',
// getData: async (page, event) => {
// const data = await page.$eval('body .pretty-print', (el) => el.innerText)
// // const xml = await xml2js.parseStringPromise(data)
// const boardLinks = xml.sitemapindex.sitemap
// .map((value) => value.loc[0])
// .map((value) => value.replace(/_layouts\/15\/oaks\.journals\/sitemap_xml\.aspx$/, ''))
// let timeout = 30000 // 30 seconds
// page.setDefaultTimeout(timeout)
// const tryLinks = async (links, options) => {
// if (options?.backoff) {
// timeout = timeout * 2
// debug(`Increasing timeout to ${timeout}ms`)
// page.setDefaultTimeout(timeout)
// }
// let title
// for (const link of links) {
// debug(`Trying ${link}`)
// const result = await page.goto(link, { waitFor: 'networkidle2' })
// const status = result.status()
// debug(`Status: ${status}`)
// if (status === 404) {
// continue
// }
// title = (await page.title()).trim()
// if (title === 'Just a moment...') {
// debug('Awaiting navigation')
// try {
// await page.waitForSelector('#aspnetForm')
// } catch (e) {
// if (e.name === 'TimeoutError') {
// debug(`Timeout error on ${link}, trying again with backoff`)
// // TODO: stop at some point so you're not in an infinite loop?
// title = (await tryLinks([link], { backoff: true }))
// } else {
// throw e
// }
// }
// title = (await page.title()).trim()
// }
// debug(`Got title ${title}`)
// if (!title.startsWith('Page Nor Found')) {
// break
// }
// }
// if (!title) {
// console.warn(`Page not found for ${links}`)
// }
// return title
// }
// for (let i = 0; i < boardLinks.length; i++) {
// const linksToTry = [
// boardLinks[i] + 'Pages/editorialboard.aspx',
// boardLinks[i] + 'Pages/JournalMasthead.aspx',
// boardLinks[i] + 'Pages/JournalContactsEditorialBoard.aspx',
// boardLinks[i] + 'Pages/editorialadvisoryboard.aspx',
// boardLinks[i] + 'Pages/publicationstaff.aspx',
// boardLinks[i] + 'Pages/aboutthejournal.aspx'
// ]
// const title = await tryLinks(linksToTry)
// debug(`Title: ${title}`)
// const members = await page.$$eval('p', (els) => els.map((val) => val.innerText.replace(/[\u200B-\u200D\uFEFF]/g, '')).filter((val) => val.includes('San Francisco')))
// printTsvLine(boardLinks[i], title, members, event)
// }
// }
// },
'Mary Ann Liebert': {
timeout: 300000,
start: 'https://home.liebertpub.com/publications/a-z',
getData: async (page, event) => {
debug('Getting Mary Ann Liebert board links...')
const data = await page.$$eval('a.pub-title', (el) => {
return el.map(el => { return { title: el.innerText, link: `${el.href}/editorial-board` } })
})
const getMembers = async (page, selector) => {
return await page.$$eval(selector, (els) => els.map((val) => val.innerText.replace(/[\u200B-\u200D\uFEFF]/g, '')).filter((val) => val.includes('San Francisco') || val.includes('UCSF')))
}
for (let i = 0; i < data.length; i++) {
const { link, title } = data[i]
debug(`Title: ${title}`)
await page.goto(link, { waitFor: 'networkidle2' })
let members = await getMembers(page, 'div.editorial p')
if (members.length === 0) {
members = await getMembers(page, '.member')
}
printTsvLine(title, link, members, event)
}
}
},
// TODO: Nature looks busted. Fix it.
// Nature: {
// start: 'https://www.nature.com/siteindex',
// getData: async (page, event) => {
// debug('Getting Nature board links...')
// // Skip these links
// const data = await page.$$eval('#journals-az ul:not(.alpha-index) a', (el) => {
// return el.map(el => { return { title: el.innerText, link: el.href } })
// })
// for (let i = 0; i < data.length; i++) {
// const { link, title } = data[i]
// debug(`Title: ${title}`)
// // TODO: For at least one journal (bdj, British Dental Journal), the
// // editorial board is on the /about page under the #editors hash.
// // That practice will require special handling to avoid false positives.
// // We want to confirm that the editorial board (or at least the id
// // #editors) is actually there. For the other URLs, we're just checking
// // for page-not-found.
// const linksToTry = [
// link + '/editors',
// link + '/about/editors',
// link + '/about/editorial-board',
// link + '/about/editorialboard',
// link + '/about/editor'
// ]
// let status
// let editorsLink
// for (editorsLink of linksToTry) {
// debug(`Trying ${editorsLink}`)
// const result = await page.goto(editorsLink, { waitFor: 'networkidle2' })
// status = result.status()
// debug(`Status: ${status}`)
// if (status !== 404) {
// break
// }
// }
// if (status === 404) {
// console.log(`Editor page not found for ${title} (${link})\t\t`)
// continue
// }
// const members = await page.$$eval('p', (els) => els.map((val) => val.innerText.replace(/\n/g, ' ').match(/(.+)(?=San Francisco|UCSF)/))
// .filter(val => val !== null)
// .map(val => val[1])
// .map(val => val.substring(0, val.indexOf(' University of California')) || val)
// .map(val => val.trim())
// )
// printTsvLine(editorsLink, title, members, event)
// }
// }
// },
Oxford: {
start: 'https://academic.oup.com/journals/pages/journals_a_to_z',
getData: async (page, event) => {
debug('Getting Oxford journal links...')
const journals = await page.$$eval(
'.secondaryContent a',
(el) => el.map((val) => { return { title: val.innerHTML, href: val.href } }).filter((val) => /^https:\/\/academic\.oup\.com\//.test(val.href) && !/#[A-Z]$/.test(val.href))
)
for (let i = 0; i < journals.length; i++) {
const linkBase = journals[i].href.replace(/\/pages\/.*$/, '')
let link = `${linkBase}/pages/Editorial_Board`
const title = journals[i].title
debug(`Trying ${link} for ${title}`)
let result = await page.goto(link, { waitFor: 'networkidle2' })
let status = result.status()
debug(`Status: ${status}`)
let members
if (status === 200) {
members = await page.$$eval('p', (els) => els.map((val) => val.innerText).filter((val) => /San Francisco|UCSF/.test(val)))
printTsvLine(link, title, members, event)
continue
}
link = `${linkBase}/pages/editorial-board`
debug(`Trying ${link} for ${title}`)
result = await page.goto(link, { waitFor: 'networkidle2' })
status = result.status()
debug(`Status: ${status}`)
if (status === 200) {
members = await page.$$eval('nameGroup', (els) => els.map((val) => val.innerText.replaceAll('\n', '')).filter((val) => /San Francisco|UCSF/.test(val)))
printTsvLine(link, title, members, event)
continue
}
debug(`Can not find editorial board for ${title} (${linkBase})`)
}
}
},
PLoS: {
start: 'https://plos.org/your-journal-options/',
getData: async (page, event) => {
debug('Getting PLoS journal links...')
const publicationsList = await page.$$eval('a.journal-selector__list-item-background-image', (el) =>
el.map((el) => { return { title: el.innerText, href: el.href } })
)
debug('Accepting cookies...')
await page.click('button ::-p-text(Save Selected Preferences and Close)')
// PLoS One will probably be a lot different than the others.
for (const publication of publicationsList) {
debug(`Going to ${publication.title}...`)
await page.goto(publication.href)
debug('Clicking About button...')
await page.click('button ::-p-text(About)')
debug('Getting editorial board links...')
const editorsLinks = await page.$$eval('a', (el) => el.filter((el) => /Editorial Board|Staff Editors|Section Editors|Advisory Groups|Editors-in-Chief/.test(el.innerText)).map((el) => el.href))
const results = []
for (const link of editorsLinks) {
debug(`Navigating to ${link}...`)
await page.goto(link)
const members = await page.$$eval('p', (el) => el.map((val) => val.innerText).filter((val) => /San Francisco|UCSF/.test(val)))
if (members.length > 0) {
results.push(`Check ${link}`)
}
}
debug('Printing editorial board links...')
printTsvLine(publication.href, publication.title, results, event)
}
}
},
SAGE: {
// TODO: Check for a next-page link to see if the 2000 parameter needs to be increased
// or if this needs to be done in a loop (if SAGE decides to limit the number of journals
// returned in a single request).
timeout: 120000,
start: 'https://journals.sagepub.com/action/showPublications?startPage=0&pageSize=2000',
getData: async (page, event) => {
const tryLink = async (link, timeout) => {
page.setDefaultNavigationTimeout(timeout)
let result
try {
result = await page.goto(link)
} catch (e) {
// TODO: Stop at some point if this keeps happening.
if (e.name === 'TimeoutError') {
timeout = timeout * 2
debug(`Timeout error on ${link}, increasing timeout to ${timeout} and trying again`)
return await tryLink(link, timeout)
} else {
throw e
}
}
return result
}
debug('Getting SAGE journal links...')
const publicationsList = await page.$$eval('.item__title a', (el) =>
el.map((el) => { return { title: el.innerText, href: el.href } })
)
for (const publication of publicationsList) {
let result = await tryLink(publication.href, 60000)
let status = result.status()
debug(`Status: ${status}`)
if (status === 200) {
let href
try {
href = await page.$eval('a[data-id="view-editorial-board"]', (el) => el.href)
} catch (e) {
debug(`Skipping ${publication.title} (${publication.href}) because it does not have an editorial board. It is probably no longer published.`)
continue
}
result = await tryLink(href, 60000)
status = result.status()
if (status === 200) {
const members = await page.$$eval('div.editorial-board tr', (els) =>
els.map((val) => val.innerText).filter((val) => /San Francisco|UCSF/.test(val))
)
members.map((val) => val.replaceAll('\t', ', '))
printTsvLine(href, publication.title, members, event)
} else {
throw new Error(`Can not find editorial board for ${publication.title} (${href}): ${status}`)
}
} else {
throw new Error(`Can not find editorial board for ${publication.title} (${publication.href}): ${status}`)
}
}
}
},
Springer: {
start: 'https://link.springer.com/journals/a/1',
getData: async (page, event) => {
const publicationsList = []
debug('Getting Springer journal names and links...')
let morePages = true
const moreLinks = await page.$$eval('a.c-atoz-navigation__link', (el) => {
return el.map(el => el.href)
})
while (moreLinks.length) {
while (morePages) {
const morePublications = await page.$$eval('a.c-atoz-list__link', (el) => {
return el.map(el => { return { title: el.innerText, link: el.href } })
})
publicationsList.push(...morePublications)
debug(`Found ${publicationsList.length} journal titles so far...`)
// Check for "Next" link and follow it if it exists.
const next = await page.$x("//a[contains(., 'Next')]")
if (next.length > 0) {
await next[0].click()
await page.waitForNavigation()
} else {
morePages = false
}
}
await page.goto(moreLinks.shift())
morePages = true
}
for (const publication of publicationsList) {
const editorsLink = publication.link
.replace('link.springer.com', 'springer.com')
.replace(/volumes.?and.?issues\/?/i, '')
.concat('/editors')
// TODO: If this fails, maybe look for "Editorial Board" link? And if not,
// there, look for "Submission Guidelines" link, follow it, and look for
// "Editorial Board" there? Maybe don't even bother with the string
// munging above and do this instead? Although we can ignore any BMC
// domains because we already did them.
debug(`Navigating to ${editorsLink}`)
await page.goto(editorsLink)
// TODO: This returns the location or institution a lot and not always
// the name.
const members = await page.$eval('body', (el) =>
el.innerText.split('\n')
.filter((val) => /San Francisco|UCSF/.test(val))
.map((val) => val.slice(0, val.indexOf('(')))
)
printTsvLine(publication.title, publication.link, members, event)
}
}
},
'Taylor & Francis': {
timeout: 300000,
start: 'https://www.tandfonline.com/action/showPublications?pubType=journal&startPage=&pageSize=99999',
getData: async (page, event) => {
debug('Getting Taylor & Francis journal names and links...')
const publications = await page.$$eval('.art_title a', (el) => {
return el.map((el) => { return { title: el.innerText, link: el.href } })
})
for (const publication of publications) {
const editorialBoardLink = publication.link.replace('/journals/', '/action/journalInformation?show=editorialBoard&journalCode=')
// TODO: robots-txt-parser is very buggy and I need to find a replacement. It returns a crawl delay of 0 for this site.
// The correct value is 1 so let's make sure we do that so they don't block us.
await setTimeout(1000)
await page.goto(editorialBoardLink)
const members = await page.$eval('.stJournal', (el) => {
return el.innerText.split('\n')
.filter((val) => /San Francisco|UCSF/.test(val))
})
printTsvLine(publication.title, publication.link, members, event)
}
}
},
Thieme: {
start: 'https://www.thieme-connect.com/products/ejournals/journals?query=*&sort=TITLE_ALPHA_ASC&rows=318&offset=0',
getData: async (page, event) => {
const pdfjsLib = await import('pdfjs-dist')
debug('Getting Thieme journal names and links...')
const publications = await page.$$eval('.journalTitle a', (el) => {
return el.map((el) => { return { title: el.innerText, link: el.href } })
})
debug(`Found ${publications.length} journal titles...`)
for (const publication of publications) {
debug(`Getting editorial board for ${publication.title}...`)
await page.goto(publication.link)
const links = await page.$$eval('.dropMenu.linkList .tab a', (el) => {
// TODO?: Lots of these pages are in German, so check for the German equivalent of "Editorial Board"?
return el.filter((el) => el.innerText === 'Editorial Board').map((el) => el.href)
})
if (links.length === 0) {
printTsvLine(publication.title, publication.link, ['No editorial board link found'], event)
continue
}
const editorialBoardLink = links[0]
const response = await fetch(editorialBoardLink)
if (!response.ok) {
debug(`Error fetching ${editorialBoardLink}: ${response.status}`)
printTsvLine(publication.title, publication.link, ['Error fetching editorial board, link broken?'], event)
continue
}
if (editorialBoardLink.endsWith('.pdf')) {
let doc
try {
doc = await pdfjsLib.getDocument({ data: await response.arrayBuffer() }).promise
} catch (e) {
debug(`Error reading PDF: ${e}`)
printTsvLine(publication.title, publication.link, ['Error reading PDF, link broken?'], event)
continue
}
let currPage = 1 // Pages are 1-based not 0-based
const numPages = doc.numPages
let fullText = ''
while (currPage <= numPages) {
const pdfPage = await doc.getPage(currPage)
const content = await pdfPage.getTextContent()
content.items.forEach(function (item) {
fullText += ' ' + item.str
})
currPage++
}
if (/San Francisco|UCSF/.test(fullText)) {
printTsvLine(publication.title, publication.link, ['Check PDF'], event)
} else {
printTsvLine(publication.title, publication.link, [''], event)
}
} else {
await page.goto(editorialBoardLink)
const boardText = await page.$eval('body', (el) => el.innerText)
if (/San Francisco|UCSF/.test(boardText)) {
printTsvLine(publication.title, publication.link, ['Check editorial board web page'], event)
} else {
printTsvLine(publication.title, publication.link, [''], event)
}
}
}
}
},
Wiley: {
start: 'https://onlinelibrary.wiley.com/action/showPublications?PubType=journal&startPage=&alphabetRange=a',
getData: async (page, event) => {
const publicationsList = []
let currentLetter = 'a'
debug('Getting Wiley journal names and links...')
let pageLoaded = true
while (pageLoaded) {
while (pageLoaded) {
const publicationsThisPage = await page.$$eval('a.visitable', (el) => {
return el.map((el) => { return { title: el.innerText, href: el.href } })
})
publicationsList.push(...publicationsThisPage)
const moreLinks = await page.$$eval('a.pagination__btn--next', (el) => {
return el.map(el => el.href)
})
if (moreLinks.length > 0) {
await page.goto(moreLinks[0])
pageLoaded = true
} else {
pageLoaded = false
}
}
currentLetter = String.fromCharCode(currentLetter.charCodeAt(0) + 1)
if (currentLetter <= 'z') {
await page.goto(`https://onlinelibrary.wiley.com/action/showPublications?PubType=journal&startPage=&alphabetRange=${currentLetter}`)
pageLoaded = true
}
}
for (const publication of publicationsList) {
try {
await page.goto(publication.href)
const editorialBoardLink = await page.$$eval(
'a.sub-menu-item',
(el) => el.filter((el) => el.innerText === 'Editorial Board').map((el) => el.href)[0]
)
if (typeof editorialBoardLink !== 'string') {
debug(`No editorial board link found for ${publication.title} (${publication.href})`)
continue
}
await page.goto(editorialBoardLink)
const members = await page.$eval('body', (el) =>
el.innerText.split('\n').filter((val) => /San Francisco|UCSF/.test(val)).map((val) => val.replaceAll(',', ' '))
)
printTsvLine(publication.href, publication.title, members, event)
} catch (e) {
debug(`Error getting editorial board for ${publication.title} (${publication.href})`)
debug(e)
}
}
}
}
}
// Export the workflow map; each key is a publisher name and each value holds
// a `start` URL, a `getData(page, event)` scraper, and optionally a `timeout`.
module.exports = workflows