-
Notifications
You must be signed in to change notification settings - Fork 1
/
texts.js
98 lines (90 loc) · 2.65 KB
/
texts.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"use strict";
const fs = require("fs");
const osmosis = require("osmosis");
const { chain } = require("stream-chain");
// Search URLs on the europarl.europa.eu "oeil" search endpoint, keyed by the
// kind of document they list. The "A8" series is used for reports and the
// "B8" series for motions; the "????" parts are left in place for callers to
// substitute (presumably search wildcards — confirm against the site).
// An earlier candidate source, kept for reference:
//   http://www.europarl.europa.eu/plenary/en/texts-submitted.html
const options = (() => {
  // Both URLs differ only in the document series and the result limit.
  const searchUrl = (series, limit) =>
    `http://www.europarl.europa.eu/oeil/search/search.do?searchTab=y&q=documentEP:D-${series}-????/????&snippet=true&noHeader=false&lang=en&dismax=y&all&limit=${limit}`;

  return {
    report: searchUrl("A8", 3000),
    motion: searchUrl("B8", 4000),
  };
})();
// Output sinks, keyed by name. Only "report" exists: scrape() writes every
// record (reports and motions alike) into pipes["report"].
const pipes = {};
pipes["report"] = streamCSV("data/text_tabled.csv");

// One promise per scrape job. The sinks may only be ended once every job has
// resolved, otherwise late records would be written to an ended stream.
const promises = [];

// Reports are fetched one year at a time: the first "/????" in the report URL
// is replaced by "/<year>".
// NOTE(review): 2015 is absent from this list — confirm whether that is
// intentional.
[2021, 2020, 2019, 2018, 2017, 2016, 2014].forEach((yy) => {
  promises.push(
    scrape(options.report.replace("/????", "/" + yy), {}, "report").then(() => {
      console.log("scraped report 8th term " + yy);
    })
  );
});

// Motions are fetched with a single request covering all years.
promises.push(
  scrape(options.motion, {}, "motion").then(() => {
    console.log("scraped motions 8th term");
  })
);

Promise.all(promises)
  .then(() => {
    // Every scrape is done: end each sink so buffered rows are flushed.
    Object.keys(pipes).forEach((name) => {
      pipes[name].end();
    });
    console.log("all finished");
  })
  .catch((err) => {
    // Surface unexpected failures instead of leaving a floating rejection.
    console.error(err);
    process.exitCode = 1;
  });
/**
 * Build a pipeline that writes record objects to a CSV file.
 *
 * The pipeline is a stream-chain of three stages: a function that flattens
 * the `rapporteur` field, a csv-write-stream encoder using a fixed column
 * list, and a file write stream for `file`. A "close" + file message is
 * logged when the pipeline emits "close".
 *
 * @param {string} file - Path of the CSV file to write.
 * @param {*} header - Accepted but not used by this function.
 * @returns {object} The pipeline returned by `chain`; write records to it
 *   and call `end()` when finished.
 */
function streamCSV(file, header) {
  const columns = [
    "reference",
    "type",
    "name",
    "rapporteur",
    "committee",
    "intra",
    "oeil",
    "doc",
  ];

  const createCsvWriter = require("csv-write-stream");
  const encoder = createCsvWriter({
    separator: ",",
    headers: columns,
    sendHeaders: true,
  });

  // A record carries its rapporteurs as a list; join them with "|" so they
  // fit into one cell. Records without rapporteurs get an empty cell.
  const flattenRapporteur = (record) => {
    if (record.rapporteur) {
      record.rapporteur = record.rapporteur.join("|");
    } else {
      record.rapporteur = "";
    }
    return record;
  };

  const sink = fs.createWriteStream(file);
  const pipeline = chain([flattenRapporteur, encoder, sink]);
  pipeline.on("close", () => console.log("close" + file));
  return pipeline;
}
/**
 * Scrape one search-result page and write each result to pipes["report"].
 *
 * For every ".single_result" node the selected fields are collected, the
 * record is tagged with `type`, the committee and rapporteur are taken from
 * the first and second ".rssEntry_row_value_item" values, and the rapporteur
 * string is split on "," into a list of trimmed names.
 *
 * @param {string} docurl - URL of the page to fetch.
 * @param {object} param - Options passed to `osmosis.get`; a falsy value
 *   falls back to `{ timeout: 3000 }`.
 * @param {string} type - Value stored in each record's `type` field.
 * @returns {Promise<string>} Resolves with `docurl` once osmosis reports it
 *   is done. The promise is never rejected: errors are only logged.
 */
function scrape(docurl, param, type) {
  // Selectors for the fields extracted from each search result.
  const selectors = {
    name: ".procedure_title",
    reference: "td em",
    oeil: ".procedure_title a@href",
    intra: ".rssEntry_title",
    doc: ".rssEntry_title a@href",
    more: [".rssEntry_row_value_item"],
    //'urls':['.documents a@href']
  };

  // Normalise one extracted record and hand it to the CSV pipeline.
  // Kept at two parameters, matching the original callback's signature.
  const emitRecord = (context, d) => {
    d.type = type;
    if (d.more) {
      d.rapporteur = d.more[1];
      d.committee = d.more[0];
    }
    if (d.rapporteur) {
      d.rapporteur = d.rapporteur.split(",").map((name) => name.trim());
    }
    pipes["report"].write(d);
  };

  return new Promise((resolve) => {
    const requestOptions = param || { timeout: 3000 };
    const job = osmosis
      .get(docurl, requestOptions)
      .log(console.log)
      .error(console.log)
      .find(".single_result")
      .set(selectors)
      .then(emitRecord);

    job.done((unused) => {
      console.log("done");
      resolve(docurl);
    });
  });
}