-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
188 lines (171 loc) · 6.19 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
// Scrapes LinkedIn profile details with Puppeteer. Requires the 'fs', 'path', 'puppeteer', 'csv-stringify', and 'dotenv' modules.
const fs = require('fs');
const { resolve } = require('path');
const puppeteer = require('puppeteer');
const { stringify } = require('csv-stringify/sync');
require('dotenv').config();
/**
 * Script entry point.
 *
 * Reads profile URLs (one per line) from `urls.txt` in the current working
 * directory, logs in to LinkedIn with the credentials found in the
 * LINKEDIN_USERNAME / LINKEDIN_PASSWORD environment variables, scrapes every
 * profile with `extractProfileDetails`, and writes the collected rows to
 * `./results.json` and `./results.csv`.
 *
 * The process exits with code 0 when every profile was scraped and with
 * code 1 when the run failed or at least one profile could not be scraped.
 */
(async () => {
  // Fail fast with a readable message instead of a cryptic page.type() error.
  const { LINKEDIN_USERNAME, LINKEDIN_PASSWORD } = process.env;
  if (!LINKEDIN_USERNAME || !LINKEDIN_PASSWORD) {
    throw new Error(
      'LINKEDIN_USERNAME and LINKEDIN_PASSWORD must be set (for example in a .env file)',
    );
  }

  // Read the URL list before opening a browser so a missing file costs nothing.
  // Splitting on /\r?\n/ and trimming keeps CRLF files from leaving "\r" on URLs.
  const profileUrls = fs
    .readFileSync(resolve(process.cwd(), 'urls.txt'), 'utf8')
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean);

  const signInButton =
    'div.flex.justify-between.sign-in-form__footer--full-width > button';

  const browser = await puppeteer.launch({
    headless: false,
    devtools: false,
    slowMo: 0,
    args: ['--window-size=1920,1080'],
  });

  const failedUrls = [];
  try {
    const page = await browser.newPage();
    // NOTE(review): `_client` is a private Puppeteer member and its shape
    // differs between Puppeteer versions; confirm against the pinned version.
    await page._client.send('Emulation.clearDeviceMetricsOverride');

    // Log in through the LinkedIn home page form.
    console.time('LinkedIn login');
    await page.goto('https://www.linkedin.com/home');
    await page.setViewport({ width: 1680, height: 866 });
    await page.waitForSelector('#session_key');
    await page.click('#session_key');
    await page.type('#session_key', LINKEDIN_USERNAME);
    await page.waitForSelector('#session_password');
    await page.click('#session_password');
    await page.type('#session_password', LINKEDIN_PASSWORD);
    await page.waitForSelector(signInButton);
    await page.click(signInButton);
    // The feed identity module only renders once the login has succeeded.
    await page.waitForSelector('.feed-identity-module__member-bg-image');
    console.timeEnd('LinkedIn login');

    // Scrape profiles one at a time: they all share the same page.
    const profiles = [];
    for (const profileUrl of profileUrls) {
      const timerLabel = `Get profile details at URL ${profileUrl}`;
      console.time(timerLabel);
      try {
        profiles.push(await extractProfileDetails(page, profileUrl));
      } catch (err) {
        // One broken profile must not discard the rows already collected.
        failedUrls.push(profileUrl);
        console.error(`Skipping ${profileUrl}`, err);
      } finally {
        console.timeEnd(timerLabel);
      }
    }

    // Write profile details to a JSON file
    fs.writeFileSync('./results.json', JSON.stringify(profiles, null, 2));
    // Write profile details to a CSV file
    const csv = stringify(profiles, { delimiter: ',', quote: '"', header: true });
    console.log(csv);
    fs.writeFileSync('./results.csv', csv);
  } finally {
    // Always release the browser, including when login or scraping throws.
    await browser.close();
  }

  if (failedUrls.length > 0) {
    console.error(`Failed to scrape ${failedUrls.length} profile(s):`, failedUrls);
  }
  return failedUrls.length;
})().then(
  (failedCount) => process.exit(failedCount === 0 ? 0 : 1),
  (err) => {
    console.error(err);
    process.exit(1);
  },
);
/**
 * Scrape one LinkedIn profile and, when possible, the website of the company
 * shown in its experience section.
 *
 * Only the profile page itself is mandatory. Every other field is best-effort:
 * a selector that does not show up in time is logged and yields `undefined`
 * for that field instead of aborting the profile.
 *
 * @param {object} page - Puppeteer-style page; this function uses `goto`,
 *   `url`, `waitForSelector`, `waitForXPath`, `$$`, `$x` and `evaluate`.
 * @param {string} profileUrl - URL of the profile to open.
 * @returns {Promise<{companyName: (string|undefined), companyUrl: (string|undefined), name: (string|undefined), title: (string|undefined)}>}
 * @throws {Error} 'Error connecting' when the profile URL cannot be opened and
 *   'Error loading profile' when the name heading never appears; the
 *   underlying error is attached as `cause`.
 */
async function extractProfileDetails(page, profileUrl) {
  // NOTE(review): '#ember31' looks like a framework-generated element id and
  // may not be stable between page loads — confirm and prefer a semantic selector.
  const nameSelector = '#ember31 > h1';

  // These callbacks are handed to page.evaluate(), so they must stay
  // self-contained (no references to variables of this function).
  const readText = (cell) => cell.textContent?.replace(/\s\s+/g, ' ').trim();
  const readHref = (cell) => cell.getAttribute('href');

  console.log('Get profile details at URL ' + profileUrl);
  try {
    await page.goto(profileUrl);
  } catch (e) {
    console.error('page goto', e);
    throw new Error('Error connecting', { cause: e });
  }
  console.log('Page loaded');

  // Wait for the profile to load
  try {
    await page.waitForSelector(nameSelector);
  } catch (e) {
    console.error('page waitForSelector', e);
    throw new Error('Error loading profile', { cause: e });
  }

  // Extract profile details
  const [name] = await extractText({ selectorPath: nameSelector });
  console.log('name', name);

  const [title] = await extractText({
    selectorPath:
      '#profile-content div.pv-text-details__left-panel--full-width > div.text-body-medium.break-words',
  });
  console.log('title', title);

  let [companyUrl] = await extractHref({
    selectorPath: '[data-field="experience_company_logo"]',
  });
  console.log('companyUrl', companyUrl);

  // First try the absolute XPath, then fall back to a CSS selector.
  let [companyName] = await extractText({
    xpath:
      '/html/body/div[6]/div[3]/div/div/div[2]/div/div/main/section[5]/div[3]/ul/li[1]/div/div[2]/div[1]/div/span[1]/span[1]',
  });
  console.log('companyName attempt 1', companyName);
  if (!companyName) {
    [companyName] = await extractText({
      selectorPath: '.pvs-entity div span.t-normal span',
    });
    console.log('companyName attempt 2', companyName);
  }

  // Load the real company URL from the company's "about" page. A navigation
  // failure is logged and leaves companyUrl at the LinkedIn company link.
  if (companyUrl) {
    console.time('Go to company page');
    try {
      console.log('Go to company page', companyUrl);
      await page.goto(companyUrl, { timeout: 10000 });
      // Strip trailing slashes so the result is ".../about", not "...//about".
      const aboutUrl = `${page.url().replace(/\/+$/, '')}/about`;
      console.log('Go to company about page', aboutUrl);
      await page.goto(aboutUrl, { timeout: 10000 });
      [companyUrl] = await extractHref({ xpath: '//dl/dd[1]/a' });
      console.log('companyUrl', companyUrl);
    } catch (e) {
      console.error('page goto', e);
    } finally {
      // Always stop the timer so the label can be reused for the next profile.
      console.timeEnd('Go to company page');
    }
  } else {
    console.error('No company link found on profile; skipping company page');
  }

  return {
    companyName: companyName?.replace(/ logo$/, ''),
    companyUrl,
    name,
    title,
  };

  /**
   * Wait for the elements matched by `xpath` (preferred when given) or
   * `selectorPath`, then run `read` on each of them inside the page.
   * Returns an empty array (after logging) when the wait or the read fails,
   * so callers can safely destructure the result.
   */
  async function extractAll(label, { xpath, selectorPath }, timeout, read) {
    try {
      if (xpath) {
        await page.waitForXPath(xpath, { timeout });
      } else {
        await page.waitForSelector(selectorPath, { timeout });
      }
      const handles = xpath
        ? await page.$x(xpath)
        : await page.$$(selectorPath);
      return await Promise.all(
        handles.map((handle) => page.evaluate(read, handle)),
      );
    } catch (e) {
      console.error(
        `${label} timeout ${JSON.stringify({ xpath, selectorPath })}`,
        e,
      );
      return [];
    }
  }

  // Helper function to extract whitespace-collapsed text from a given selector
  function extractText(target) {
    return extractAll('extractText', target, 3000, readText);
  }

  // Helper function to extract href from a given selector
  function extractHref(target) {
    return extractAll('extractHref', target, 10000, readHref);
  }
}