webscrape_util.R
########################################
## utility functions for web scraping ##
########################################
## download xlsx as temp file and load as data frame
## note: read.csv can directly read a csv file from a url
# dependency: httr, readxl
load_xlsx_from_url<-function(url,skip=0){
  # download to a temp file, then read it with readxl
  GET(url,write_disk(dat <- tempfile(fileext = ".xlsx")))
  dat<-read_xlsx(dat,skip=skip)
  return(dat)
}
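## example usage (a sketch only; the url below is a placeholder, not a real workbook):
# library(httr); library(readxl)
# dat<-load_xlsx_from_url("https://example.com/report.xlsx",skip=1)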
## build link for LOINC code search result
get_loinc_ref<-function(loinc){
  #url to loinc.org for the given LOINC code
  url<-paste0("https://loinc.org/",loinc)
  #return the link
  return(url)
}
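## example usage (a sketch; 2160-0 is the LOINC code for serum/plasma creatinine):
# get_loinc_ref("2160-0")   # returns "https://loinc.org/2160-0"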
## get loinc name via the FHIR terminology service (fhir.loinc.org)
# dependency: RCurl; requires a loinc.org account for basic authentication
get_loinc_nm<-function(loinc){
  #url to the fhir.loinc.org $lookup operation
  url<-paste0("https://fhir.loinc.org/CodeSystem/$lookup?system=http://loinc.org&code=",loinc)
  #credentials are read from the LOINC_USERPWD environment variable
  #(name chosen here; set it as "user:password" before calling) instead of being hard-coded
  loinc_obj<-getURL(url = url,
                    userpwd=Sys.getenv("LOINC_USERPWD"),
                    httpauth = 1L)
  #return the raw FHIR response; name extraction is left to the caller
  return(loinc_obj)
}
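## example usage (a sketch; assumes LOINC_USERPWD is set to "user:password" for a
## valid loinc.org account and that the caller parses the raw FHIR response):
# Sys.setenv(LOINC_USERPWD="myuser:mypassword")
# loinc_resp<-get_loinc_nm("2160-0")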
## get RXNORM (rxcui) name from the RxNav REST API
# dependency: RCurl, XML
get_rxcui_nm<-function(rxcui){
  #url link to REST API
  rx_url<-paste0("https://rxnav.nlm.nih.gov/REST/rxcui/",rxcui,"/")
  #get and parse the response (htmlParse lower-cases the tag names)
  rxcui_obj <- getURL(url = rx_url)
  rxcui_content<-htmlParse(rxcui_obj)
  #extract name
  rxcui_name<-xpathApply(rxcui_content, "//body//rxnormdata//idgroup//name", xmlValue)
  if (length(rxcui_name)==0){
    rxcui_name<-NA
  }else{
    rxcui_name<-unlist(rxcui_name)
  }
  return(rxcui_name)
}
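## example usage (a sketch; 161 is the rxcui for acetaminophen, so the call is
## expected to return that name if the api response format is unchanged):
# get_rxcui_nm("161")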
## get NDC proprietary name by scraping the ndclist.com search result page
# dependency: RCurl, XML
get_ndc_nm<-function(ndc){
  #url link to the search page
  rx_url<-paste0("https://ndclist.com/?s=",ndc)
  #get and parse html object
  rx_obj<-getURL(url = rx_url)
  if (rx_obj==""){
    rx_name<-NA
  }else{
    #extract the table cell whose data-title attribute is "Proprietary Name"
    rx_content<-htmlParse(rx_obj)
    rx_attr<-xpathApply(rx_content, "//tbody//td[@data-title]",xmlAttrs)
    rx_name<-xpathApply(rx_content, "//tbody//td[@data-title]",xmlValue)[which(rx_attr=="Proprietary Name")]
    rx_name<-unlist(rx_name)
    #if the search matches more than one product, return the search url instead
    if(length(rx_name) > 1){
      rx_name<-rx_url
    }
  }
  return(rx_name)
}
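## example usage (a sketch; the ndc below is illustrative only and has not been
## verified against ndclist.com):
# get_ndc_nm("0002-3227")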
#ref: https://www.r-bloggers.com/web-scraping-google-urls/
## google a medical code of the form "TYPE:code" (e.g. "CH:99213") and return the top link(s)
# dependency: RCurl, XML
google_code<-function(code,nlink=1){
  #map the "CH" prefix to "CPT"; otherwise keep the prefix as the code type
  code_type<-ifelse(gsub(":.*","",code)=="CH","CPT",
                    gsub(":.*","",code))
  code<-gsub(".*:","",code)
  #search on google
  gu<-paste0("https://www.google.com/search?q=",code_type,":",code)
  html<-getURL(gu)
  #parse HTML into tree structure
  doc<-htmlParse(html)
  #extract url nodes using XPath. Originally I had used "//a[@href][@class='l']" until the google code change.
  attrs<-xpathApply(doc, "//h3//a[@href]", xmlAttrs)
  #extract urls
  links<-sapply(attrs, function(x) x[[1]])
  #only keep the secure links and strip google redirect parameters
  links<-links[grepl("(https\\:)+",links)]
  links<-gsub("(\\&sa=U).*$","",links)
  links<-paste0("https://",gsub(".*(https://)","",links))
  #free doc from memory
  free(doc)
  #return the top nlink links
  return(links[seq_len(min(nlink,length(links)))])
}
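## example usage (a sketch; 99213 is a common CPT office-visit code and E11.9 an ICD-10
## diabetes code; note that google may block or rate-limit automated queries, in which
## case no links are returned):
# google_code("CPT:99213")
# google_code("ICD10:E11.9",nlink=3)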