-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.js
153 lines (109 loc) · 4.82 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
'use strict';
// This project takes advantage of Promises in ES6.
// Promise.all() is used to wait until each shirt URL is scraped and resolved.
// Process:
// 1) Try to access shirts4mike.com
// 2) Parse the body of the website, looking for t-shirt URLs
// 3) Promise that each shirt URL will be retrieved and then pushed into an array of objects
// 4) Write information from the array to a CSV file
// 5) Log errors to scrapper-error.log
//Dependencies
const csvWriter = require('csv-write-stream');
const request = require('request');
const cheerio = require('cheerio');
const moment = require('moment');
const fs = require('fs');
const os = require('os');
// Base URL of shirts4mike.com (scraped relative links are appended to it)
// and the page that lists every shirt.
const rootURL = 'http://shirts4mike.com/'
const allShirtURL = 'http://shirts4mike.com/shirts.php';
// Today's date (YYYY-MM-DD), formatted with the "moment" package; used as the CSV file name.
const date = moment().format("YYYY-MM-DD");
// Shirt objects collected while scraping; written to the CSV file once all requests finish.
const infoToWrite = [];
// CSV stream that receives one row per scraped shirt.
const writer = csvWriter();
// NOTE(review): ErrorWriter is not referenced anywhere in the visible code — confirm before removing.
const ErrorWriter = csvWriter();
// Promise that fetches the shirt listing page (allShirtURL).
// Resolves with the page's HTML body on a 200 response.
// Rejects with an Error when the request fails or returns any other status.
// Side effects: creates './data' if it is missing and attaches the CSV writer
// to './data/<date>.csv'.
const getShirtURL = new Promise((resolve, reject) => {
  // Make sure the output directory exists before anything is written to it.
  if (!fs.existsSync('./data')) {
    console.log("Creating data folder to store CSV file and log errors...");
    fs.mkdirSync('./data');
  }

  // Always attach the writer to today's CSV file. This used to happen only
  // when './data' had just been created, so every later run produced no CSV.
  writer.pipe(fs.createWriteStream('./data/' + date + '.csv'));

  // Check that the shirt listing page can be reached.
  request(allShirtURL, (error, response, body) => {
    if (error) {
      reject(error);
      return;
    }

    if (response.statusCode !== 200) {
      // `error` is null here, so build a real Error for the rejection handler.
      reject(new Error(`There's been a (${response.statusCode}) error. Cannot connect to ${allShirtURL}.`));
      return;
    }

    console.log(`Status Code: ${response.statusCode} - OK`);
    console.log(`Successfully connected to http://shirts4mike.com/shirts.php${os.EOL}`);
    resolve(body);
  });
});
//END of getShirtURL Promise
// Extracts the URL of every shirt linked from the listing page.
// @param {string} body - HTML of the shirt listing page
// @returns {string[]} one URL per '.products a' link, each prefixed with rootURL
const scrapeBody = function(body) {
  const $ = cheerio.load(body);

  // cheerio's .map() drops null/undefined results, so anchors that have no
  // href are skipped instead of producing a bogus ".../undefined" URL.
  const hrefs = $('.products a')
    .map((i, elem) => $(elem).attr('href'))
    .get();

  return hrefs.map((href) => rootURL + href);
};
// Scrapes every shirt page in parallel and, once all of them have resolved,
// writes the collected shirt objects (infoToWrite) to the CSV writer.
// @param {string[]} urlArray - URLs of the individual shirt pages
// @returns {Promise} settles after the CSV rows are written; rejects if any
//   shirt request rejects
const getAndWriteShirtInfo = (urlArray) => {
  const pendingShirts = urlArray.map((url) => scrapeShirtInformation(url));

  // Return the chain so a failed shirt request propagates to the caller's
  // .catch() handler instead of becoming an unhandled rejection.
  return Promise.all(pendingShirts).then(() => {
    console.log(`${os.EOL}ALL Promises have been resolved! The shirt object array is: ${os.EOL}`);
    console.log(infoToWrite);
    console.log(`${os.EOL}There were ${infoToWrite.length} shirts scraped from shirts4mike.com`);

    for (const shirt of infoToWrite) {
      writer.write(shirt);
    }
    writer.end();
  });
};
// Fetches one shirt page, extracts its details, and pushes them onto infoToWrite.
// @param {string} url - URL of the shirt's detail page
// @returns {Promise<Object>} resolves with the shirt record
//   ({ Title, Price, ImageUrl, Url, Time }) after it has been pushed onto
//   infoToWrite; rejects when the request fails or returns a non-200 status
const scrapeShirtInformation = (url) => {
  return new Promise((resolve, reject) => {
    request(url, (error, response, body) => {
      if (error) {
        reject(error);
        return;
      }

      // Do not scrape error pages as if they were shirts.
      if (response.statusCode !== 200) {
        reject(new Error(`Received status ${response.statusCode} while retrieving ${url}`));
        return;
      }

      const $ = cheerio.load(body);
      const price = $('.price').text();
      // The heading text is expected to start with the price followed by one
      // separator character; skip past both to keep only the title.
      const title = $(".shirt-details h1").text().slice(price.length + 1);
      const relativeImageUrl = $(".shirt-picture img").attr("src");

      console.log('Retrieve data for: ' + url);

      // Key order determines the CSV column order.
      const shirtInfo = {
        Title: title,
        Price: price,
        // Absolute image URL (previously the relative path was stored here).
        ImageUrl: rootURL + relativeImageUrl,
        // URL of the shirt page itself (previously the image URL was stored here).
        Url: url,
        Time: moment().format('ddd MMM Do YYYY h:mm:ss a'),
      };

      // Push shirt info into the infoToWrite array.
      infoToWrite.push(shirtInfo);
      resolve(shirtInfo);
    });
  });
};
// Rejection handler: appends a timestamped entry to './data/scrapper-error.log'
// and prints the error to the console.
// @param {*} error - the rejection reason; usually an Error, but any value
//   (including null) is tolerated
const catchError = (error) => {
  const errorDate = moment().format('ddd MMM Do YYYY h:mm:ss a');

  // Keep everything from the "GMT±hhmm" offset onward. Taking only the last two
  // space-separated tokens truncated long zone names to e.g. "Standard Time)".
  const now = new Date().toString();
  const gmtIndex = now.indexOf('GMT');
  const timeZoneFormatted = gmtIndex === -1 ? '' : now.slice(gmtIndex);

  // A rejection reason is not guaranteed to be an Error object.
  const message = (error && error.message) ? error.message : String(error);

  // Append error to scrapper-error.log file. If the file doesn't exist it'll be created.
  fs.appendFile('./data/scrapper-error.log', `[${errorDate} ${timeZoneFormatted}] ${message} ${os.EOL}`, (appendError) => {
    console.error(`An error has occured while running scrapper.js.${os.EOL}See error information below (or check error log file): ${os.EOL}${message}`);
    if (appendError) {
      // Report a failed log write instead of ignoring it silently.
      console.error(`Could not write to the error log: ${appendError.message}`);
    }
  });
};
// Entry point: once the listing page has been fetched, extract the shirt URLs
// from its body, scrape and write each shirt, and send any failure to catchError.
getShirtURL
  .then((body) => getAndWriteShirtInfo(scrapeBody(body)))
  .catch(catchError);