index.js

/* eslint-disable no-console */
const puppeteer = require('puppeteer');
const BibtexParser = require('bib2json');
const path = require('path');
const fs = require('fs').promises;

const outDir = 'out_json';
const inDir = 'in_bib';

// You can use a single file or an array of files as input.
// Single file:
const year = 21; // uist21
const name = `uist${year}`;
const files = [path.join(inDir, `${name}.bib`)];
// Array of files. Note that scraping several files in one run may get your
// IP blocked if you exceed the server's request limit.
// const files = ['test1.bib', 'test2.bib'];

scrap(files).then((res) => {
  const output = path.join(outDir, `${name}.json`);
  // If there is only one proceedings, write just that object
  // instead of a one-element array.
  let result = res; // the array of proceedings
  if (result.length === 1) result = result[0];
  // fs.promises.writeFile returns a promise; it does not accept a callback.
  fs.writeFile(output, JSON.stringify(result)).catch((err) => console.log(err));
}).catch((err) => console.error(err));
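
// Illustrative output shape (field names taken from the code below;
// the values are examples, not real data):
// [{ proceedings: 'uist21',
//    papers: [{ url, title, author: [...], institution: [...], country: [...] }] }]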
// Helpers

// Return the base name of a file without directory or extension,
// e.g. 'in_bib/uist21.bib' -> 'uist21'.
function getFileName(filename) {
  return path.parse(filename).name;
}

// Parse a .bib file and scrape author/institution data for each entry.
async function parse(filename) {
  const text = await fs.readFile(filename, 'utf8');
  const data = BibtexParser(text);
  const papers = [];
  for (let i = 0; i < data.entries.length; i += 1) {
    const paperData = data.entries[i].Fields;
    const paper = await getPaperData(paperData.url);
    paper.url = paperData.url;
    paper.title = paperData.title;
    console.log(`Paper ${i + 1} of ${data.entries.length} in ${filename} with url ${paper.url}`);
    papers.push(paper);
  }
  return papers;
}
// Launch a headless browser, open the paper's page, and extract
// the list of authors and their institutions.
async function getPaperData(url) {
  // starting Puppeteer
  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true,
    // Tricky bug: in the mobile layout not all authors are rendered,
    // so force a desktop-sized window and viewport.
    args: ['--window-size=1920,1080'],
    defaultViewport: {
      width: 1920,
      height: 1080,
    },
  });
  try {
    // opening a new page and navigating to the paper's URL
    const page = await browser.newPage();
    await page.goto(url, {
      waitUntil: 'load',
      timeout: 0, // remove the timeout
    });
    await page.waitForSelector('body');
    // manipulating the page's content
    const dataPaper = await page.evaluate(() => {
      const result = { author: [], institution: [] };
      const author = document.body.querySelectorAll('.loa__author-name');
      const institution = document.body.querySelectorAll('.loa_author_inst:not(.hidden)'); // before 2021 it was '.auth-institution'
      author.forEach((element) => {
        result.author.push(element.outerText);
      });
      institution.forEach((element) => {
        // Collapse newlines and repeated whitespace; plain string
        // .replace() would only replace the first occurrence.
        const cleanedText = element.innerText.replace(/\s+/g, ' ').trim();
        result.institution.push(cleanedText);
      });
      result.author = [...new Set(result.author)];
      return result;
    });
    dataPaper.country = toCountries([...new Set(dataPaper.institution)]);
    return dataPaper;
  } finally {
    // closing the browser, even if scraping failed
    await browser.close();
  }
}
// Take the last comma-separated field of each affiliation string
// as the country, and de-duplicate the result.
function toCountries(list) {
  const result = [];
  for (const el of list) {
    const country = el.split(',').slice(-1)[0].trim();
    result.push(country);
  }
  return [...new Set(result)];
}
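
// For example (illustrative input):
// toCountries(['HCI Lab, Example University, Sweden', 'Example Institute, Japan'])
// returns ['Sweden', 'Japan'].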
// Parse and scrape every input file in parallel, then pair each
// result with its proceedings name. Written as an async function so
// that any scraping error propagates instead of being swallowed.
async function scrap(inputFiles) {
  const actions = inputFiles.map(parse);
  const data = await Promise.all(actions); // array of promises, one per file
  const all = [];
  for (let i = 0; i < inputFiles.length; i += 1) {
    const proc = {};
    proc.proceedings = getFileName(inputFiles[i]);
    proc.papers = data[i];
    all.push(proc);
  }
  return all;
}
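
// If the server starts blocking your IP (see the note next to the
// multi-file input above), one possible mitigation is to scrape files
// sequentially with a pause in between. This is a minimal sketch, not
// part of the original script; `sleep`, `scrapSequential`, and the
// 2000 ms default are hypothetical.
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

async function scrapSequential(inputFiles, delayMs = 2000) {
  const all = [];
  for (const file of inputFiles) {
    // Process one file at a time instead of Promise.all.
    all.push({ proceedings: getFileName(file), papers: await parse(file) });
    await sleep(delayMs); // pause between files to stay under rate limits
  }
  return all;
}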