-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl.js
94 lines (86 loc) · 2.46 KB
/
crawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const fetch = require("node-fetch");
// Reading input from the user interactively is currently disabled:
// const readline = require("readline");
/**
 * Recursively crawl every same-host page reachable from `currentURL`,
 * tallying how many times each normalized URL is encountered.
 *
 * Each distinct page is fetched at most once; later sightings of the same
 * page only bump its count in `pages`.
 *
 * @param {string} baseURL - Root URL of the site; only URLs whose hostname
 *   matches this one are counted or fetched.
 * @param {string} currentURL - Absolute URL to process in this call.
 * @param {Object<string, number>} [pages] - Tally keyed by normalized URL.
 *   Mutated in place and also returned.
 * @param {Set<string>} [visitedUrls] - Normalized URLs that have already been
 *   fetched. Created automatically on the outermost call.
 * @returns {Promise<Object<string, number>>} The `pages` tally.
 * @throws {TypeError} If `currentURL` or `baseURL` is not a valid absolute URL.
 */
async function crawlPage(baseURL, currentURL, pages = {}, visitedUrls) {
  // Keep the caller-visible parameter untouched; fall back to a fresh set.
  const visited = visitedUrls || new Set();

  // Offsite links are neither counted nor fetched.
  if (new URL(currentURL).hostname !== new URL(baseURL).hostname) {
    return pages;
  }

  const normalizedURL = normalizeURL(currentURL);

  // Count the sighting BEFORE the visited check: otherwise the early return
  // below makes the increment unreachable and every count stays at 1.
  pages[normalizedURL] = (pages[normalizedURL] || 0) + 1;

  if (visited.has(normalizedURL)) {
    return pages;
  }
  visited.add(normalizedURL);

  console.log(`Crawling ${currentURL}`);

  let htmlBody;
  try {
    const resp = await fetch(currentURL);
    if (resp.status > 399) {
      console.log("HTTP Error: Goodbye cruel world :(");
      return pages;
    }

    // headers.get() yields null when the header is absent; treat that as
    // "not HTML" instead of letting `.includes` throw a TypeError.
    const contentType = resp.headers.get("content-type");
    if (!contentType || !contentType.includes("text/html")) {
      console.log("Not HTML: You're dead to me");
      return pages;
    }

    htmlBody = await resp.text();
  } catch (err) {
    // Best-effort crawl: a failed request is logged and this branch ends.
    console.log(err);
    return pages;
  }

  // Links are followed one at a time on purpose: the recursive calls share
  // `pages` and `visited`, and sequential requests keep the load modest.
  for (const url of getURLsFromHTML(htmlBody, baseURL)) {
    await crawlPage(baseURL, url, pages, visited);
  }

  return pages;
}
/**
 * Reduce a URL to a comparison key made of its host followed by its path.
 *
 * The scheme, query string and fragment are not part of the key, and a
 * single trailing "/" is removed.
 *
 * @param {string} url - Absolute URL to normalize.
 * @returns {string} The host + pathname key.
 * @throws {TypeError} If `url` cannot be parsed by the URL constructor.
 */
const normalizeURL = (url) => {
  const { host, pathname } = new URL(url);
  const hostAndPath = host + pathname;
  return hostAndPath.endsWith("/") ? hostAndPath.slice(0, -1) : hostAndPath;
};
/**
 * Collect the link targets of every <a> element in an HTML document.
 *
 * An href that starts with "/" is resolved against `baseURL`; any other href
 * must parse as an absolute URL on its own. Hrefs that fail to parse are
 * logged and left out of the result.
 *
 * @param {string} htmlBody - HTML markup to scan.
 * @param {string} baseURL - Base used to resolve hrefs that start with "/".
 * @returns {string[]} Absolute URL strings, in document order.
 */
function getURLsFromHTML(htmlBody, baseURL) {
  const dom = new JSDOM(htmlBody);
  const collected = [];

  dom.window.document.querySelectorAll("a").forEach((anchor) => {
    const href = anchor.href;
    try {
      // Only root-relative hrefs get a base; everything else stands alone.
      const parsed = href.startsWith("/") ? new URL(href, baseURL) : new URL(href);
      collected.push(parsed.href);
    } catch (err) {
      console.log(`${err.message}: ${href}`);
    }
  });

  return collected;
}
// Public API of this module (CommonJS): the URL normalizer, the link
// extractor, and the recursive crawler defined above.
module.exports = {
normalizeURL,
getURLsFromHTML,
crawlPage,
};