# crypto-scrape.R
# Scrapes the historical price tables of every coin listed on
# coinmarketcap.com and writes the combined, cleaned data to a CSV file.
library(jsonlite)
library(plyr)
library(dplyr)
library(doSNOW)
library(doParallel)
library(lubridate)
# Functions ---------------------------------------------------------------
# Retrieve Coin Listings -----
#' Retrieve the CoinMarketCap coin listing.
#'
#' Downloads the quick-search JSON index and builds, for every listed coin,
#' the URL of its historical-data page covering 2013-04-28 up to the current
#' date.
#'
#' @return A tibble with one row per coin and the columns `symbol`, `name`,
#'   `slug` (character; the full historical-data URL, not the bare slug) and
#'   `rank` (numeric).
getCoins <- function() {
  # Same "YYYYMMDD" string the page expects for the `end` query parameter.
  end_date <- format(Sys.Date(), "%Y%m%d")
  listing_url <- "https://files.coinmarketcap.com/generated/search/quick_search.json"
  listing <- jsonlite::read_json(listing_url, simplifyVector = TRUE)
  coins <- dplyr::tibble(
    symbol = listing$symbol,
    name = listing$name,
    slug = listing$slug,
    rank = listing$rank
  )
  # sprintf() is vectorised over the slugs and, unlike paste0(), yields a
  # zero-length result for an empty listing.
  coins$slug <- sprintf(
    "https://coinmarketcap.com/currencies/%s/historical-data/?start=20130428&end=%s",
    coins$slug, end_date
  )
  coins$rank <- as.numeric(coins$rank)
  coins
}
# Scrape Historical Tables -----
#' Scrape the historical price table from one coin page.
#'
#' @param attributes A single page location passed straight to `read_html()`;
#'   the script supplies the historical-data URL of one coin.
#' @return The first `<table>` of the page as a data frame, with an added
#'   `symbol` column holding the first space-separated token of the page's
#'   `.col-sm-4 .text-large` heading (after parentheses, newlines and pairs of
#'   whitespace characters are stripped).
abstracts <- function(attributes) {
  page <- read_html(attributes)
  heading <- page %>%
    html_nodes(css = ".col-sm-4 .text-large") %>%
    html_text(trim = TRUE) %>%
    replace(!nzchar(.), NA)
  if (length(heading) == 0L) {
    stop("No coin heading found on page: ", attributes, call. = FALSE)
  }
  tables <- page %>%
    html_nodes(css = "table") %>%
    .[1] %>%
    html_table(fill = TRUE) %>%
    replace(!nzchar(.), NA)
  history <- Reduce(rbind, tables)
  if (is.null(history)) {
    stop("No table found on page: ", attributes, call. = FALSE)
  }
  # Only the first heading is used; its first token is the ticker symbol.
  cleaned <- gsub("\\(|\\n|\\)|\\s\\s", "", heading)
  symbol <- as.character(strsplit(cleaned, " ")[[1]][1])
  # rep() keeps the assignment valid even when the table has no rows.
  history$symbol <- rep(symbol, nrow(history))
  history
}
# Cleanup results table -----
#' Normalise the merged scrape results into typed market data.
#'
#' @param results A data frame with exactly ten columns, in this order:
#'   symbol, date, open, high, low, close, volume, market cap, coin name and
#'   rank. Columns are renamed by position.
#' @return `results` with the columns renamed to `symbol`, `date`, `open`,
#'   `high`, `low`, `close`, `volume`, `market`, `name`, `ranknow`; `date`
#'   reformatted as a "YYYY-MM-DD" string; the six price/volume columns
#'   converted to numeric; plus two derived columns, `variance`
#'   ((close - open) / close) and `volatility` ((high - low) / close).
cleanUp <- function(results) {
  names(results) <- c(
    "symbol", "date", "open", "high", "low", "close", "volume",
    "market", "name", "ranknow"
  )
  marketdata <- results

  # Convert one scraped column to numeric. Columns that are already numeric
  # are passed through untouched: routing them through text would lose
  # precision and would expose the minus sign of values such as 1e-05.
  to_number <- function(x, dash_as_zero = FALSE) {
    if (is.numeric(x)) {
      return(as.numeric(x))
    }
    x <- gsub(",", "", as.character(x), fixed = TRUE)
    if (dash_as_zero) {
      # A lone "-" is the placeholder for "no data"; treat it as zero.
      x[!is.na(x) & trimws(x) == "-"] <- "0"
    }
    as.numeric(x)
  }

  marketdata$date <- format(
    strptime(marketdata$date, format = "%b %d,%Y"),
    "%Y-%m-%d"
  )
  marketdata$open <- to_number(marketdata$open)
  marketdata$high <- to_number(marketdata$high)
  marketdata$low <- to_number(marketdata$low)
  marketdata$close <- to_number(marketdata$close, dash_as_zero = TRUE)
  marketdata$volume <- to_number(marketdata$volume, dash_as_zero = TRUE)
  marketdata$market <- to_number(marketdata$market, dash_as_zero = TRUE)

  # Change between open and close, relative to the close
  marketdata$variance <-
    (marketdata$close - marketdata$open) / marketdata$close
  # Spread between the day's high and low, relative to the close
  marketdata$volatility <-
    (marketdata$high - marketdata$low) / marketdata$close
  marketdata
}
# START CRYPTOCURRENCY SCRAPING SCRIPT ------------------------------------
# Crypto Scraping Setup ---------------------------------------------------
# Destination of the final CSV.
file <- "~/Desktop/Crypto-Markets.csv"
coins <- getCoins()
n_coins <- length(coins$slug)
if (n_coins == 0L) {
  stop("getCoins() returned no coins; nothing to scrape.", call. = FALSE)
}
# detectCores() returns NA when the core count cannot be determined; fall
# back to a single worker in that case.
cpucore <- as.numeric(detectCores(all.tests = FALSE, logical = TRUE))
if (is.na(cpucore)) {
  cpucore <- 1
}
ptm <- proc.time()
# Uncomment for fiat exchange rate ----- exchange_rate <-
# fromJSON('https://api.fixer.io/latest?base=USD') AUD <- exchange_rate$rates$AUD
# Parallel process scraping with progress bar -----------------------------
cluster <- makeCluster(cpucore, type = "SOCK")
registerDoSNOW(cluster)
pb <- txtProgressBar(max = n_coins, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)
attributes <- coins$slug
# Combine results and stop clusters ---------------------------------------
# `finally` closes the progress bar and shuts the workers down even when one
# of the page scrapes fails.
results <- tryCatch(
  foreach(
    i = seq_len(n_coins), .options.snow = opts, .combine = rbind,
    .packages = "rvest"
  ) %dopar%
    abstracts(attributes[i]),
  finally = {
    close(pb)
    stopCluster(cluster)
  }
)
# Cleanup results and fix names -------------------------------------------
# NOTE(review): the join key is `symbol`; if two listed coins share a symbol,
# each price row is paired with every matching coin -- confirm this is
# acceptable.
coinnames <- coins[, c("symbol", "name", "rank")]
results <- merge(results, coinnames, by = "symbol")
marketdata <- cleanUp(results)
write.csv(marketdata, file, row.names = FALSE)
print(proc.time() - ptm)