-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
150 lines (126 loc) · 5.13 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
const puppeteer = require('puppeteer');
/**
 * Best-effort removal of modal dialogs, popups and overlays from the page.
 *
 * Steps, in order:
 *  1. Remove every element that currently matches the modal selector.
 *  2. Install a MutationObserver on `document.body` that removes matching
 *     elements added later, including ones nested inside an inserted container.
 *  3. Press Escape and click at (0, 0) as fallbacks; failures of either
 *     fallback are logged and never thrown.
 *
 * @param {object} page - Puppeteer page; uses `evaluate`, `keyboard.press` and `mouse.click`.
 * @returns {Promise<void>} Resolves once all steps have been attempted.
 */
async function handleModalsDynamically(page) {
  // The callbacks given to page.evaluate run inside the browser and cannot
  // close over Node-side variables, so the selector is passed as an argument.
  const modalSelector = '[role="dialog"], [aria-modal="true"], div[class*="modal"], div[class*="popup"], div[class*="overlay"]';

  // Remove existing modals based on common patterns.
  await page.evaluate((selector) => {
    document.querySelectorAll(selector).forEach((modal) => modal.remove());
  }, modalSelector);

  // Watch for modals that are injected after this point.
  await page.evaluate((selector) => {
    const observer = new MutationObserver((mutations) => {
      mutations.forEach((mutation) => {
        mutation.addedNodes.forEach((node) => {
          // Text and comment nodes have no matches(); nothing to remove.
          if (typeof node.matches !== 'function') {
            return;
          }
          if (node.matches(selector)) {
            node.remove();
            return;
          }
          // A childList record lists only the root of an inserted subtree,
          // so a modal wrapped in a container has to be searched for.
          node.querySelectorAll(selector).forEach((nested) => nested.remove());
        });
      });
    });
    observer.observe(document.body, { childList: true, subtree: true });
  }, modalSelector);

  // Fallback method: pressing the Escape key.
  await page.keyboard.press('Escape').catch((e) => console.log('Error pressing Escape:', e.message));
  // Fallback method: clicking outside the modal.
  await page.mouse.click(0, 0).catch((e) => console.log('Error clicking outside modal:', e.message));
}
/**
 * Wait until the trimmed text of the first element matching `selector`
 * differs from `previousContent`.
 *
 * The check runs in the page via `page.waitForFunction` with default options,
 * so it rejects if that call times out.
 *
 * @param {object} page - Puppeteer page; uses `waitForFunction`.
 * @param {string} selector - CSS selector of the element to watch.
 * @param {string} previousContent - Trimmed text the element had before the change.
 * @returns {Promise<void>} Resolves once the content differs.
 */
async function waitForContentChange(page, selector, previousContent) {
  await page.waitForFunction(
    (sel, previous) => {
      const element = document.querySelector(sel);
      // The element may be briefly absent while the page re-renders; keep
      // polling instead of throwing on a null dereference.
      if (!element) {
        return false;
      }
      return element.textContent.trim() !== previous;
    },
    {},
    selector,
    previousContent
  );
}
/**
 * Scrape the reviews shown by the Yotpo widget on `url`, following the
 * widget's pagination until every page has been read.
 *
 * Any error (navigation, missing elements, timeouts) is logged and results
 * in an empty array; the browser is always closed.
 *
 * @param {string} url - Page containing the Yotpo reviews widget.
 * @returns {Promise<Array<{name: string, rating: string, title: string, desc: string, date: string}>>}
 *   One entry per review; a field is '' when its element is missing from the review.
 */
async function yotpoScraper(url) {
  let browser;
  try {
    const selectors = {
      yotpo: 'div.yotpo.yotpo-main-widget',
      reviews: 'div.yotpo-reviews',
      review: 'div.yotpo-review',
      name: 'span.yotpo-user-name',
      rating: 'div.yotpo-review-stars span.sr-only',
      title: 'div.yotpo-main div.content-title',
      desc: 'div.content-review',
      date: 'span.yotpo-review-date',
      pager: 'div.yotpo-pager[data-total]',
      next: 'div.yotpo-pager a[rel^=next]',
    };

    // NOTE(review): launches a visible (non-headless) browser; confirm this is
    // intended outside of local debugging.
    browser = await puppeteer.launch({
      headless: false,
    });
    const page = await browser.newPage();
    await page.setViewport({
      width: 1280,
      height: 1024,
    });

    // Block Yotpo analytics requests; let everything else through.
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      if (request.url().includes('https://p.yotpo.com/i?e=se&se_ca=reviews&se_ac=shown&se_psk')) {
        request.abort();
      } else {
        request.continue();
      }
    });

    await page.goto(url, { waitUntil: 'networkidle2' });
    await page.waitForSelector(selectors.reviews);
    await handleModalsDynamically(page);

    // Read both pager attributes in one round trip; null when there is no pager.
    const pagerInfo = await page.evaluate((selector) => {
      const pager = document.querySelector(selector.pager);
      if (!pager) {
        return null;
      }
      return {
        total: pager.getAttribute('data-total'),
        perPage: pager.getAttribute('data-per-page'),
      };
    }, selectors);

    const reviewsTotal = pagerInfo ? pagerInfo.total : null;
    const reviewsPerPage = pagerInfo ? pagerInfo.perPage : null;
    const computedPages = Math.ceil(
      Number.parseInt(reviewsTotal, 10) / Number.parseInt(reviewsPerPage, 10)
    );
    // Without a usable pager (missing element/attribute, zero per page) read
    // just the page that is currently displayed.
    const reviewsPages = Number.isFinite(computedPages) && computedPages > 0 ? computedPages : 1;

    console.log('Total reviews:', reviewsTotal);
    console.log('Reviews per page:', reviewsPerPage);
    console.log('Pages:', reviewsPages);

    const reviewsArr = [];
    for (let p = 1; p <= reviewsPages; p += 1) {
      console.log('Getting page:', p);

      const pageReviews = await page.evaluate((selector) => {
        // Look fields up inside each review element so a review with a
        // missing field cannot shift the fields of the reviews after it.
        const textOf = (root, css) => {
          const element = root.querySelector(css);
          return element ? element.textContent.trim() : '';
        };
        return Array.from(document.querySelectorAll(selector.review), (review) => ({
          name: textOf(review, selector.name),
          rating: textOf(review, selector.rating),
          title: textOf(review, selector.title),
          desc: textOf(review, selector.desc),
          date: textOf(review, selector.date),
        }));
      }, selectors);

      console.log('Got', pageReviews.length, 'reviews from page', p);
      reviewsArr.push(...pageReviews);

      // If not on the last page, click through to the next one.
      if (p !== reviewsPages) {
        const previousContent = await page.$eval(selectors.review, (el) => el.textContent.trim());
        // Start waiting for the response before clicking so a fast response
        // cannot slip past the listener.
        await Promise.all([
          page.waitForResponse((response) => (
            response.url().includes('https://staticw2.yotpo.com/batch/app_key') && response.status() === 200
          )),
          page.click(selectors.next),
        ]);
        await waitForContentChange(page, selectors.review, previousContent);
      }
    }

    return reviewsArr;
  } catch (error) {
    console.log(`Error: ${error}`);
    return [];
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}
// Public entry point: expose the scraper on the CommonJS exports object.
exports.yotpoScraper = yotpoScraper;