#' ---
#' title: "gumtree_scraper"
#' author: "JJayes"
#' date: "27/03/2021"
#' output: html_document
#' ---
#'
## ----setup, include=FALSE-----------------------------------------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(rvest)
library(glue)
library(lubridate)
# to get this as an R script to put online I can use:
# knitr::purl("code/gumtree_scraper.Rmd", documentation = 2)
#'
#' # Purpose
#'
#' Scrape Gumtree automatically and store the scraped data on GitHub.
#'
#' ## Strategy
#'
#' Get 1000 ads from each province every day.
#'
#' ### Questions
#'
#' - What should I get from each advert?
#'
#' #### Starting with list of adverts
#'
#' We have 9 provinces. Each landing page URL consists of a stub, a province, a page, a tag and a final page number.
#'
#' For [example](https://www.gumtree.co.za/s-cars-bakkies/eastern-cape/page-2/v1c9077l3100197p2)
#'
#' The stub is: `https://www.gumtree.co.za/s-cars-bakkies/`
#'
#' The province is: `eastern-cape/`
#'
#' The page is: `page-2/`
#'
#' The tag is: `v1c9077l3100197`
#'
#' The final page is: `p2`
#'
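#' Put together, those pieces rebuild the example URL above. A quick string check (just the values from the example, nothing scraped):
#'
## -----------------------------------------------------------------------------------------------------------
str_c("https://www.gumtree.co.za/s-cars-bakkies/",
      "eastern-cape/",
      "page-2/",
      "v1c9077l3100197",
      "p2")
# -> "https://www.gumtree.co.za/s-cars-bakkies/eastern-cape/page-2/v1c9077l3100197p2"
#'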
#' We will make a tibble with the provinces and tags, which are unique.
#'
#'
## -----------------------------------------------------------------------------------------------------------
df <- tibble(
province = c("gauteng",
"western-cape",
"kwazulu+natal",
"eastern-cape",
"mpumalanga",
"limpopo",
"north-west",
"free-state",
"northern-cape"),
tag = c("v1c9077l3100003",
"v1c9077l3100001",
"v1c9077l3100002",
"v1c9077l3100197",
"v1c9077l3100227",
"v1c9077l3100223",
"v1c9077l3100231",
"v1c9077l3100236",
"v1c9077l3100213")
)
df <- df %>%
# create home url from stub, province and tag
mutate(home_url = str_c("https://www.gumtree.co.za/s-cars-bakkies/",
province,
"/",
tag,
"p1"))
#'
#' ### Function to find the last page
#'
#' We can get a maximum of 1000 ads from each province - but we need to know how many pages of adverts there are.
#'
#' This function gets the number of adverts, divides it by 20 and adds one to get the number of pages of adverts. If that works out to more than 50 pages, the function returns 50, as that is the maximum we can scrape.
#'
## -----------------------------------------------------------------------------------------------------------
# get the number of pages
get_last_page <- function(value){
message(glue("Getting page numbers from {value}"))
html <- read_html(value)
n_ads <- html %>%
html_nodes(".displayResults") %>%
html_text() %>%
str_remove("Results 1 to 20 of") %>%
parse_number()
# total pages
pages_data <- round(n_ads/20) + 1
# cap at 50 pages, the most we can scrape per province
pages_data <- ifelse(pages_data <= 50, pages_data, 50)
pages_data
}
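#'
#' As a quick sanity check on that arithmetic, here are two hypothetical advert counts (made-up numbers, nothing scraped):
#'
## -----------------------------------------------------------------------------------------------------------
round(375 / 20) + 1   # 375 adverts  -> 20 pages
round(4500 / 20) + 1  # 4500 adverts -> 226 pages, which get_last_page() caps at 50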
#'
#' Apply function to each province's landing page and collect the number of pages.
#'
## -----------------------------------------------------------------------------------------------------------
df <- df %>%
# possibly() wraps get_last_page so a landing page that errors returns "failed" instead of stopping the run
mutate(last_page = map(home_url, possibly(get_last_page, "failed")))
# unlist so last_page becomes a plain column; note that if any look-up returned "failed", unlist() coerces everything to character
df <- df %>% mutate(last_page = unlist(last_page))
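#'
#' If a look-up did fail, that coercion would break the page sequence below. One way to cope (a sketch, left commented out) is to drop the failed rows before unlisting:
#'
## -----------------------------------------------------------------------------------------------------------
# df <- df %>%
#   filter(map_lgl(last_page, is.numeric)) %>%
#   mutate(last_page = unlist(last_page))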
#'
#' ### Create list of pages to look at for finding ads
#'
#' Look at this nice function! It takes the province, the last page and the tag and then makes a list of pages that we can scrape to get all the adverts. I've called it stick because it sticks strings together.
#'
## -----------------------------------------------------------------------------------------------------------
stick <- function(province, last_page, tag){
str_c("https://www.gumtree.co.za/s-cars-bakkies/",
province,
"/page-",
1:last_page,
"/",
tag,
"p",
        1:last_page) %>%
    as_tibble() %>%
    nest(data = everything())
}
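#'
#' As a quick illustration (not needed for the scrape itself), three Eastern Cape pages built with stick() look like this:
#'
## -----------------------------------------------------------------------------------------------------------
stick("eastern-cape", 3, "v1c9077l3100197") %>%
  unnest(data)
# -> .../eastern-cape/page-1/v1c9077l3100197p1 through .../page-3/v1c9077l3100197p3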
#'
#' #### I get to use pmap!! How cool!!
#'
#' Here we take the data frame of province information above and map across province, last_page and tag so that we build the right number of page URLs to grab ads from. Ah so nice!
#'
## -----------------------------------------------------------------------------------------------------------
df <- df %>%
  mutate(pages = pmap(list(province, last_page, tag), stick)) %>%
  unnest(pages) %>%
  unnest(data) %>%
  rename(page_url = value)
#'
#' Now that we have our nice list of pages, we can scrape them for the advert links.
#'
#' ### Function to get the ad links from the list of pages
#'
## -----------------------------------------------------------------------------------------------------------
get_ad_links <- function(page_url){
html <- read_html(page_url)
message(glue("Getting links from {page_url}"))
html %>%
html_nodes(".related-ad-title") %>%
html_attr("href")
}
#'
#' ### Iterate through list of links
#'
## -----------------------------------------------------------------------------------------------------------
# creates a list of links from each page
list_of_links <- df %>%
# possibly() means a page that errors comes back as "failed" and the map continues instead of stopping
mutate(ad_url = map(page_url, possibly(get_ad_links, otherwise = "failed")))
# unnest the list of links into a long tibble with one row per advert
list_of_links <- list_of_links %>%
unnest(ad_url) %>%
mutate(ad_url = str_c("https://www.gumtree.co.za", ad_url),
# keep an ad number - always useful to have an index.
ad_number = row_number())
#'
#' ### Save list of links
#'
#' We put today's date and time into the file name to make it easy to keep track of.
#'
## -----------------------------------------------------------------------------------------------------------
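# assumption: the data/links folder may not exist yet (e.g. on a fresh clone); create it quietly if needed
dir.create("data/links", recursive = TRUE, showWarnings = FALSE)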
st <- format(Sys.time(), "%Y-%m-%d-%I-%M-%p")
write.csv(list_of_links, paste0("data/links/", st, ".csv"))
#'
#' # Scraping the adverts from the ad urls.
#'
## -----------------------------------------------------------------------------------------------------------
get_ad_text_gumtree <- function(ad_url){
# store the html from the page
html <- read_html(ad_url)
message(glue("Getting ad from {ad_url}"))
# site
site <- "Gumtree"
# seller type
seller_type <- ifelse(is.na(html %>%
html_node(".B2C-respond") %>%
html_text()),
"Private or unregistered dealer",
html %>%
html_node(".B2C-respond") %>%
html_text())
# title
title <- html %>%
html_node("h1") %>%
html_text()
# price
price <- html %>%
html_node(".ad-price") %>%
html_text() %>%
parse_number()
# text
text <- html %>%
html_nodes("#revip-description .description-content") %>%
html_text()
# info table
# info_table <- html %>%
# html_nodes(".attribute") %>%
# html_text() %>%
# paste(collapse = " - ")
info_table <- bind_cols(
html %>%
# stats
html_nodes(".attribute .name") %>%
html_text() %>% as_tibble(),
# values
html %>%
html_nodes(".attribute .value") %>%
html_text() %>% as_tibble()) %>%
select(info_cols = 1, info_values = 2) %>% nest(data = everything())
# photos
n_photos <- html %>%
html_node(".count") %>%
html_text() %>%
parse_number()
# views
n_views <- html %>%
html_node(".view-count span") %>%
html_text()
# date of ad
ad_date <- as.character(now()- html %>%
html_node(".vip-stats .creation-date") %>%
html_text() %>%
str_to_lower() %>%
str_remove(" ago") %>%
str_replace_all("a |an ", "1 ") %>%
duration())
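  # e.g. "2 days ago" becomes "2 days"; duration("2 days") subtracted from now() gives an approximate posting date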
# ad_date_safe is kept (commented out) in case the parsing above breaks; the posting date could then be worked out from the raw creation date and the scrape time
# ad_date_safe <- html %>%
# html_node(".vip-stats .creation-date") %>%
# html_text()
# seller name
seller_name <- html %>%
html_node(".seller-name") %>%
html_text()
# seller age
seller_age <- ifelse(is.na(html %>%
html_node(".seller-create-date") %>%
html_text()), html %>%
html_node(".seller-year") %>%
html_text(), html %>%
html_node(".seller-create-date") %>%
html_text())
# all time ads
n_all_time_ads <- ifelse(is.na(html %>%
html_node(".seller-active-ads:nth-child(1) .ads-number-info span") %>%
html_text()),
html %>%
html_node(".icon-ad-view+ .number") %>%
html_text(),
html %>%
html_node(".seller-active-ads:nth-child(1) .ads-number-info span") %>%
html_text())
# active ads
n_active_ads <- ifelse(is.na(html %>%
html_node(".seller-active-ads+ .seller-active-ads span") %>%
html_text()),
html %>%
html_node(".number") %>%
html_text(),
html %>%
html_node(".seller-active-ads+ .seller-active-ads span") %>%
html_text())
# location
location <- str_c((html %>%
html_node(".attribute:nth-child(1)") %>%
html_text() %>%
str_remove("Location:")),
html %>%
html_node(".breadcrumbs span:nth-child(2) span") %>%
html_text(),
sep = ", ")
# scrape date and time
scrape_time <- format(Sys.time(), "%Y-%m-%d-%I-%M-%p")
tibble(site, seller_type, title, price, text, info_table,
n_photos, n_views, ad_date, seller_name,
location, seller_age, n_all_time_ads,
n_active_ads, scrape_time)
}
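#'
#' Before mapping over every link, a single advert can be spot-checked by hand (left commented out because it needs a live connection):
#'
## -----------------------------------------------------------------------------------------------------------
# get_ad_text_gumtree(list_of_links$ad_url[1])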
#'
#' ### Map through each ad_url
#'
## ---- warning=F---------------------------------------------------------------------------------------------
# mapping through each url
ads_nested <- list_of_links %>%
distinct(ad_url, .keep_all = T) %>%
mutate(text = map(ad_url, possibly(get_ad_text_gumtree, "failed")))
ads <- ads_nested %>%
filter(text != "failed") %>%
unnest(cols = c(text)) %>%
unnest(text) %>%
unnest(data) %>%
pivot_wider(names_from = info_cols, values_from = info_values)
#'
#' ### Write data to output
#'
## -----------------------------------------------------------------------------------------------------------
st <- format(Sys.time(), "%Y-%m-%d-%I-%M-%p")
write_rds(ads, paste0("data/raw/ads_", st, ".rds"), compress = "gz")
#'
#' Store the data in a folder called latest as a csv for the shiny app to pull in.
#'
## -----------------------------------------------------------------------------------------------------------
write.csv(ads, "data/latest/ads_latest.csv")
#'