-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraping.js
49 lines (49 loc) · 1.54 KB
/
scraping.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
const { parse } = require("node-html-parser");
const axios = require("axios");
const { unwantedChars } = require("./utils/unwantedChars");
const { showMoreContent } = require("./utils/showMoreContent");
const Scrape = require("./mongo/models");
// Request config handed to axios.get(): sends the request through the HTTP
// proxy reachable at host "tor-proxy", port 8118.
const proxy = {
  proxy: {
    host: "tor-proxy",
    port: 8118,
  },
};
exports.scrape = async (
url,
divSelector,
authorSelector,
titleSelector,
contentSelector,
dateSelector
) => {
const page = await axios.get(url, proxy);
const htmlString = page.data;
const parsedHtml = parse(htmlString);
const elements = parsedHtml.querySelectorAll(divSelector);
for (let elem of elements) {
const authorSec = elem.querySelector(authorSelector);
const titleSec = elem.querySelector(titleSelector);
const moreContentHref = elem.querySelector("a");
const dateSec = elem.querySelector(dateSelector);
if (!authorSec || !titleSec || !moreContentHref || !dateSec) continue;
try {
const contentSec = await showMoreContent(
moreContentHref.attrs.href,
contentSelector
);
const author = authorSec.innerText.match(/(?<=by\s)(.*)(?=\sat)/)[0];
const title = unwantedChars(titleSec.innerText);
const content = unwantedChars(contentSec.innerText);
const date = dateSec.innerText.match(/(?<=at\s)(.*)/)[0];
const entry = new Scrape({
author,
title,
content,
date: new Date(date),
});
if (!(await Scrape.exists({ title, content, author }))) {
await entry.save();
}
} catch {
continue;
}
}
};