diff --git a/DESCRIPTION b/DESCRIPTION
index 64ed91b8..cf5960a2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -27,7 +27,10 @@ Suggests:
     testthat,
     httr,
     magrittr,
     readr,
-    xml2
+    xml2,
+    dplyr,
+    purrr,
+    printr
 VignetteBuilder: knitr
 keywords: metadata, codemeta, ropensci, citation, credit
 affiliation: https://ropensci.org
diff --git a/README.md b/README.md
index 18545b8e..8a93c95f 100644
--- a/README.md
+++ b/README.md
@@ -185,6 +185,39 @@ write_codemeta(".")
       "name": "Central R Archive Network (CRAN)",
       "url": "https://cran.r-project.org"
     }
+  },
+  {
+    "@type": "SoftwareApplication",
+    "identifier": "dplyr",
+    "name": "dplyr",
+    "provider": {
+      "@id": "https://cran.r-project.org",
+      "@type": "Organization",
+      "name": "Central R Archive Network (CRAN)",
+      "url": "https://cran.r-project.org"
+    }
+  },
+  {
+    "@type": "SoftwareApplication",
+    "identifier": "purrr",
+    "name": "purrr",
+    "provider": {
+      "@id": "https://cran.r-project.org",
+      "@type": "Organization",
+      "name": "Central R Archive Network (CRAN)",
+      "url": "https://cran.r-project.org"
+    }
+  },
+  {
+    "@type": "SoftwareApplication",
+    "identifier": "printr",
+    "name": "printr",
+    "provider": {
+      "@id": "https://cran.r-project.org",
+      "@type": "Organization",
+      "name": "Central R Archive Network (CRAN)",
+      "url": "https://cran.r-project.org"
+    }
   }
 ],
 "softwareRequirements": [
@@ -261,10 +294,10 @@ write_codemeta(".")
 "keywords": ["metadata", "codemeta", "ropensci", "citation", "credit"],
 "relatedLink": "https://codemeta.github.io/codemetar",
 "contIntegration": "https://travis-ci.org/codemeta/codemetar",
-"developmentStatus": "wip",
+"developmentStatus": "active",
 "releaseNotes": "https://github.com/codemeta/codemetar/blob/master/NEWS.md",
 "readme": "https://github.com/codemeta/codemetar/blob/master/README.md",
-"fileSize": "366.241KB"
+"fileSize": "397.768KB"
 }

 Modifying or enriching CodeMeta metadata

diff --git a/codemeta.json b/codemeta.json
index 501aaa4d..ed63174f 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -147,6 +147,39 @@
       "name": "Central R Archive Network (CRAN)",
       "url": "https://cran.r-project.org"
     }
+  },
+  {
+    "@type": "SoftwareApplication",
+    "identifier": "dplyr",
+    "name": "dplyr",
+    "provider": {
+      "@id": "https://cran.r-project.org",
+      "@type": "Organization",
+      "name": "Central R Archive Network (CRAN)",
+      "url": "https://cran.r-project.org"
+    }
+  },
+  {
+    "@type": "SoftwareApplication",
+    "identifier": "purrr",
+    "name": "purrr",
+    "provider": {
+      "@id": "https://cran.r-project.org",
+      "@type": "Organization",
+      "name": "Central R Archive Network (CRAN)",
+      "url": "https://cran.r-project.org"
+    }
+  },
+  {
+    "@type": "SoftwareApplication",
+    "identifier": "printr",
+    "name": "printr",
+    "provider": {
+      "@id": "https://cran.r-project.org",
+      "@type": "Organization",
+      "name": "Central R Archive Network (CRAN)",
+      "url": "https://cran.r-project.org"
+    }
   }
 ],
 "softwareRequirements": [
@@ -223,8 +256,8 @@
 "keywords": ["metadata", "codemeta", "ropensci", "citation", "credit"],
 "relatedLink": "https://codemeta.github.io/codemetar",
 "contIntegration": "https://travis-ci.org/codemeta/codemetar",
-"developmentStatus": "wip",
+"developmentStatus": "active",
 "releaseNotes": "https://github.com/codemeta/codemetar/blob/master/NEWS.md",
 "readme": "https://github.com/codemeta/codemetar/blob/master/README.md",
-"fileSize": "366.241KB"
+"fileSize": "397.768KB"
 }
diff --git a/docs/LICENSE.html b/docs/LICENSE.html
index fb08ed29..4772dc6b 100644
--- a/docs/LICENSE.html
+++ b/docs/LICENSE.html
@@ -70,6 +70,9 @@
Here we illustrate some example use cases that involve parsing codemeta data.
library(jsonld)
library(jsonlite)
library(magrittr)
library(codemetar)
library(purrr)
library(dplyr)
library(printr)
We start with a simple example from the codemeta.json file of codemetar itself. First, we’ll just generate a copy of the codemeta record for the package:
write_codemeta("codemetar", "codemeta.json")
We then digest this input using a JSON-LD “frame.” While not strictly necessary, this helps ensure the data matches the format we expect, even if the original file had errors or missing data. (See the vignette “Validating in JSON-LD” in this package and the official JSON-LD docs for details.) The codemetar package includes a reasonably explicit frame to get us started:
frame <- system.file("schema/frame_schema.json", package="codemetar")

meta <-
  jsonld_frame("codemeta.json", frame) %>%
  fromJSON(FALSE) %>% getElement("@graph") %>% getElement(1)
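Since the framed result is now an ordinary R list, we can spot-check a few fields directly (a quick sanity check; these fields come straight from the codemeta vocabulary):

## spot-check the framed record; values should match the DESCRIPTION file
meta$name
meta$version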
Construct a citation
authors <-
  lapply(meta$author,
         function(author)
           person(given = author$given,
                  family = author$family,
                  email = author$email,
                  role = "aut"))
year <- meta$datePublished
if(is.null(year))
  year <- format(Sys.Date(), "%Y")
bibitem <-
  bibentry(
    bibtype = "Manual",
    title = meta$name,
    author = authors,
    year = year,
    note = paste0("R package version ", meta$version),
    url = meta$URL,
    key = meta$identifier
  )
Warning in bibentry(bibtype = "Manual", title = meta$name, author =
authors, : Not all arguments are of the same length, the following need to
be recycled: author
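One likely cause of this warning is that authors is a plain list of person objects rather than a single person vector, so bibentry() sees an argument of mismatched length. A possible remedy, sketched here rather than taken from the original workflow, is to collapse the list first:

## sketch: collapse the list of person objects into one person vector,
## so the author argument has a well-defined length
authors_vec <- do.call("c", authors)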
cat(format(bibitem, "bibtex"))
@Manual{codemetar,
  title = {codemetar: Generate CodeMeta Metadata for R Packages},
  year = {2017},
  note = {R package version 0.1.0},
}
bibitem
(2017). _codemetar: Generate CodeMeta Metadata for R Packages_. R
package version 0.1.0.
The ropensci corpus consists of a list of codemeta files for all packages provided by the rOpenSci project, <ropensci.org>. This provides a good test-case for how a large collection of codemeta files can be manipulated to help us get a better picture of the corpus.
download.file("https://github.com/codemeta/codemetar/raw/master/inst/notebook/ropensci.json",
              "ropensci.json")
As before, it is helpful, though not essential, to start off by framing the input data.
frame <- system.file("schema/frame_schema.json", package="codemetar")

corpus <-
  jsonld_frame("ropensci.json", frame) %>%
  fromJSON(simplifyVector = FALSE) %>%
  getElement("@graph")
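Before iterating, it can be worth confirming the shape of the framed result (a quick, optional check):

## corpus should be a plain list with one element per package record
length(corpus)
class(corpus[[1]])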
We’re now ready to start exploring. As usual, functions from purrr
prove very useful for iterating through large JSON files. First, we look at some basic summary data:
## deal with nulls explicitly by starting with map
pkgs <- map(corpus, "name") %>% compact() %>% as.character()

## keep only those with package identifiers (names)
keep <- map_lgl(corpus, ~ length(.x$identifier) > 0)
corpus <- corpus[keep]

## now we can just do
all_pkgs <- map_chr(corpus, "name")
head(all_pkgs)
[1] "AntWeb: programmatic interface to the AntWeb"
[2] "aRxiv: Interface to the arXiv API"
[3] "chromer: Interface to Chromosome Counts Database API"
[4] "ckanr: Client for the Comprehensive Knowledge Archive Network ('CKAN') 'API'"
[5] "dashboard: A package status dashboard"
[6] "ggit: Git Graphics"
## 61 unique maintainers
map_chr(corpus, c("maintainer", "familyName")) %>% unique() %>% length()
[1] 61
## Mostly Scott
map_chr(corpus, c("maintainer", "familyName")) %>%
  as_tibble() %>%
  group_by(value) %>%
  tally(sort=TRUE)
value | n
---|---
Chamberlain | 105
Ooms | 12
Mullen | 8
Ram | 8
Boettiger | 6
Salmon | 5
FitzJohn | 4
Hart | 2
Leeper | 2
Marwick | 2
Müller | 2
Padgham | 2
South | 2
Varela | 2
Vitolo | 2
Arnold | 1
Attali | 1
Banbury | 1
Becker | 1
Bengtsson | 1
Braginsky | 1
Broman | 1
Bryan | 1
Dallas | 1
de Queiroz | 1
Drost | 1
Fischetti | 1
Ghahraman | 1
Goring | 1
hackathoners | 1
Harrison | 1
Hughes | 1
Jahn | 1
Jones | 1
Keyes | 1
Krah | 1
Lehtomaki | 1
Lovelace | 1
Lundstrom | 1
McGlinn | 1
McVey | 1
Meissner | 1
Michonneau | 1
Moroz | 1
Otegui | 1
Pardo | 1
Pennell | 1
Poelen | 1
Robinson | 1
Ross | 1
Rowlingson | 1
Scott | 1
Seers | 1
Shotwell | 1
Sievert | 1
Sparks | 1
Stachelek | 1
Szöcs | 1
Widgren | 1
Wiggin | 1
Winter | 1
## number of co-authors ...
map_int(corpus, function(r) length(r$author)) %>%
  as_tibble() %>%
  group_by(value) %>%
  tally(sort=TRUE)
value | n
---|---
1 | 146
2 | 30
3 | 17
4 | 8
5 | 5
7 | 3
13 | 1
## the contributor field isn't used as much ...
map_int(corpus, function(r) length(r$contributor)) %>%
  as_tibble() %>%
  group_by(value) %>%
  tally(sort=TRUE)
value | n
---|---
0 | 178
2 | 13
4 | 9
3 | 7
5 | 1
6 | 1
8 | 1
Numbers (n) of packages with a total of (value) dependencies:
map_int(corpus, function(r) length(r$softwareRequirements)) %>%
  as_tibble() %>%
  group_by(value) %>%
  tally(sort=TRUE)
value | n
---|---
4 | 39
5 | 35
2 | 25
3 | 25
7 | 19
6 | 16
8 | 13
9 | 8
12 | 7
10 | 6
11 | 6
13 | 3
0 | 2
14 | 1
17 | 1
18 | 1
21 | 1
22 | 1
23 | 1
Which dependencies are used most frequently?
corpus %>%
  map_df(function(x){
    ## single, unboxed dep
    if("name" %in% names(x$softwareRequirements))
      dep <- x$softwareRequirements$name
    else if("name" %in% names(x$softwareRequirements[[1]]))
      dep <- map_chr(x$softwareRequirements, "name")
    else { ## no requirements
      dep <- NA
    }

    tibble(identifier = x$identifier, dep = dep)
  }) -> dep_df

dep_df %>%
  group_by(dep) %>%
  tally(sort = TRUE)
dep | n
---|---
jsonlite | 99
httr | 92
R | 66
tibble | 46
dplyr | 43
methods | 37
xml2 | 37
data.table | 35
utils | 35
crul | 31
plyr | 29
XML | 25
magrittr | 24
sp | 22
stringr | 21
curl | 18
ggplot2 | 18
lazyeval | 17
stats | 17
lubridate | 14
R6 | 14
rappdirs | 13
assertthat | 12
digest | 12
RCurl | 12
readr | 11
rgdal | 10
whisker | 10
scales | 9
ape | 8
raster | 8
tidyr | 8
Rcpp | 7
reshape2 | 7
rvest | 7
rgeos | 6
V8 | 6
hoardr | 5
rjson | 5
taxize | 5
tools | 5
git2r | 4
maps | 4
oai | 4
openssl | 4
R(>=3.2.1) | 4
solrium | 4
urltools | 4
foreach | 3
knitr | 3
leaflet | 3
maptools | 3
memoise | 3
mime | 3
pdftools | 3
purrr | 3
RColorBrewer | 3
rgbif | 3
rmarkdown | 3
shiny | 3
spocc | 3
stringi | 3
uuid | 3
wicket | 3
yaml | 3
base64enc | 2
bibtex | 2
Biostrings | 2
crayon | 2
devtools | 2
downloader | 2
fauxpas | 2
gdata | 2
gistr | 2
graphics | 2
grid | 2
htmltools | 2
htmlwidgets | 2
httpcode | 2
igraph | 2
jqr | 2
MASS | 2
miniUI | 2
ncdf4 | 2
png | 2
R.cache | 2
R.utils | 2
rcrossref | 2
rentrez | 2
reshape | 2
rmapshaper | 2
rplos | 2
rvertnet | 2
shinyjs | 2
storr | 2
tm | 2
NA | 2
analogue | 1
antiword: Extract Text from Microsoft Word Documents | 1
apipkgen: Package Generator for HTTP API Wrapper Packages | 1
appl: Approximate POMDP Planning Software | 1
aRxiv | 1
binman | 1
Biobase | 1
BiocGenerics | 1
biomaRt | 1
bold | 1
caTools | 1
ckanr | 1
cld2: Google’s Compact Language Detector 2 | 1
countrycode | 1
cranlogs | 1
crminer | 1
crosstalk | 1
DBI | 1
dirdf: Extracts Metadata from Directory and File Names | 1
doParallel | 1
DT(>=0.1) | 1
elastic | 1
EML | 1
fastmatch | 1
foreign | 1
functionMap | 1
genderdata: Historical Datasets for Predicting Gender from Names | 1
GenomeInfoDb | 1
GenomicFeatures | 1
GenomicRanges(>=1.23.24) | 1
geoaxe | 1
geojson | 1
geojsonrewind: Fix ‘GeoJSON’ Winding Direction | 1
geonames | 1
geoops: ‘GeoJSON’ Manipulation Operations | 1
geosphere | 1
getPass | 1
ggm | 1
ggmap | 1
ggthemes | 1
graphql | 1
grDevices | 1
gridExtra | 1
gtools | 1
hash | 1
hexbin | 1
historydata: Data Sets for Historians | 1
Hmisc | 1
httpuv | 1
IRanges | 1
isdparser | 1
jsonvalidate | 1
jsonvalidate: Validate ‘JSON’ | 1
leafletR | 1
loggr | 1
mapproj | 1
markdown | 1
Matrix | 1
memisc | 1
miniUI(>=0.1.1) | 1
nabor | 1
natserv | 1
openxlsx | 1
osmar | 1
outliers | 1
pdftools: Text Extraction and Rendering of PDF Documents | 1
phytools | 1
plotly | 1
plumber | 1
progress | 1
protolite | 1
qlcMatrix | 1
RApiSerialize | 1
rapport | 1
rbhl | 1
rbison | 1
rebird | 1
redland | 1
redux | 1
remotes | 1
ridigbio | 1
ritis | 1
rJava | 1
RJSONIO | 1
rlist | 1
Rmpfr | 1
RMySQL | 1
rncl | 1
rnoaa | 1
rnrfa | 1
rotl | 1
rowr | 1
RPostgreSQL | 1
rredis | 1
rredlist | 1
RSQLite | 1
rstudioapi(>=0.5) | 1
rtracklayer | 1
rworldmap | 1
rzmq: R Bindings for ZeroMQ | 1
S4Vectors | 1
scrapeR | 1
selectr | 1
sf | 1
shiny(>=0.13.2) | 1
snow | 1
SnowballC | 1
spatstat | 1
SSOAP | 1
stringdist | 1
sys | 1
tabulizerjars | 1
testthat | 1
tif: Text Interchange Format | 1
USAboundariesData: Datasets for the ‘USAboundaries’ package | 1
VariantAnnotation | 1
viridisLite | 1
wdman(>=0.2.2) | 1
wellknown | 1
wicket: Utilities to Handle WKT Spatial Data | 1
WikidataR | 1
wikitaxa | 1
withr | 1
worrms | 1
xslt: XSLT 1.0 Transformations | 1
zoo | 1
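Some entries above carry version requirements (e.g. R(>=3.2.1)) or whole package titles rather than bare names. A possible clean-up step, sketched here rather than part of the original analysis, normalizes the names before tallying:

## sketch: strip version constraints and ": Title ..." text from dep names
dep_df %>%
  mutate(dep = sub("\\(.*$", "", dep),   # drop "(>=x.y.z)" suffixes
         dep = sub(":.*$", "", dep)) %>% # drop ": Title ..." suffixes
  group_by(dep) %>%
  tally(sort = TRUE)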
An alternate approach uses a frame instead of purrr functions for subsetting the data. Note that this gets all Depends and Suggests (really, all SoftwareApplication types mentioned):
dep_frame <- '{
  "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
  "@explicit": "true",
  "name": {}
}'
jsonld_frame("ropensci.json", dep_frame) %>%
  fromJSON() %>%
  getElement("@graph") %>%
  filter(type == "SoftwareApplication") %>%
  group_by(name) %>%
  tally(sort = TRUE)
name | n
---|---
testthat | 168
knitr | 122
jsonlite | 105
httr | 96
roxygen2 | 92
R | 72
rmarkdown | 68
covr | 52
dplyr | 49
tibble | 48
xml2 | 41
methods | 38
utils | 37
data.table | 36
ggplot2 | 36
crul | 33
plyr | 32
magrittr | 28
sp | 26
XML | 25
curl | 21
stringr | 21
lazyeval | 18
stats | 18
lubridate | 16
R6 | 14
readr | 14
rgdal | 14
rappdirs | 13
assertthat | 12
devtools | 12
digest | 12
raster | 12
RCurl | 12
scales | 12
Rcpp | 11
whisker | 11
leaflet | 10
rgeos | 10
taxize | 10
tidyr | 10
reshape2 | 9
ape | 8
maps | 8
V8 | 8
maptools | 7
purrr | 7
rvest | 7
pdftools | 6
rgbif | 6
shiny | 6
ggmap | 5
git2r | 5
hoardr | 5
ncdf4 | 5
png | 5
rjson | 5
tools | 5
oai | 4
openssl | 4
R(>=3.2.1) | 4
rcrossref | 4
RSQLite | 4
sf | 4
solrium | 4
urltools | 4
uuid | 4
yaml | 4
DBI | 3
fauxpas | 3
foreach | 3
gdata | 3
gistr | 3
graphics | 3
lintr | 3
MASS | 3
memoise | 3
mime | 3
miniUI | 3
R.utils | 3
RColorBrewer | 3
rentrez | 3
rmapshaper | 3
rvertnet | 3
rworldmap | 3
spocc | 3
stringi | 3
wicket | 3
base64enc | 2
bibtex | 2
Biostrings | 2
broom | 2
crayon | 2
downloader | 2
elastic | 2
geiger | 2
getPass | 2
GGally | 2
ggthemes | 2
grDevices | 2
grid | 2
gridExtra | 2
htmltools | 2
htmlwidgets | 2
httpcode | 2
igraph | 2
jqr | 2
jsonvalidate | 2
listviewer | 2
mapproj | 2
Matrix | 2
phylobase | 2
phytools | 2
R.cache | 2
RcppRedis | 2
readxl | 2
remotes | 2
reshape | 2
rplos | 2
shinyjs | 2
storr | 2
sys | 2
tm | 2
viridis | 2
webp | 2
zoo | 2
akima | 1
analogue | 1
aRxiv | 1
binman | 1
Biobase | 1
BiocGenerics | 1
biomaRt | 1
bold | 1
Cairo | 1
caTools | 1
ckanr | 1
corrplot | 1
countrycode | 1
cranlogs | 1
crminer | 1
crosstalk | 1
dendextend | 1
doParallel | 1
dplyr(>=0.3.0.2) | 1
DT(>=0.1) | 1
EML | 1
etseed | 1
fastmatch | 1
fields | 1
forecast | 1
foreign | 1
fulltext | 1
functionMap | 1
genderdata | 1
GenomeInfoDb | 1
GenomicFeatures | 1
GenomicRanges(>=1.23.24) | 1
geoaxe | 1
geojson | 1
geojsonio | 1
geojsonlint | 1
geonames | 1
geosphere | 1
ggalt | 1
ggm | 1
graphql | 1
GSODR | 1
gtools | 1
hash | 1
hexbin | 1
historydata | 1
Hmisc | 1
httpuv | 1
IRanges | 1
IRdisplay | 1
isdparser | 1
janeaustenr | 1
jpeg | 1
knitcitations | 1
leafletR | 1
loggr | 1
magick | 1
mapdata | 1
markdown | 1
MCMCglmm | 1
memisc | 1
miniUI(>=0.1.1) | 1
mongolite | 1
nabor | 1
natserv | 1
openair | 1
openxlsx | 1
osmar | 1
outliers | 1
pander | 1
parallel | 1
plot3D | 1
plotKML | 1
plotly | 1
plumber | 1
progress | 1
protolite | 1
purrrlyr | 1
qlcMatrix | 1
RApiSerialize | 1
rapport | 1
rbhl | 1
rbison | 1
rcdk | 1
Rcompression | 1
readtext | 1
rebird | 1
RedisAPI | 1
redland | 1
redux | 1
reeack | 1
rfigshare | 1
ridigbio | 1
rinat | 1
ritis | 1
rJava | 1
RJSONIO | 1
rlist | 1
Rmpfr | 1
RMySQL | 1
rnaturalearthdata | 1
rnaturalearthhires | 1
rncl | 1
RNeXML | 1
rnoaa | 1
rnrfa | 1
ropenaq | 1
rotl | 1
rowr | 1
RPostgreSQL | 1
rrdf | 1
rredis | 1
rredlist | 1
rrlite | 1
RSclient | 1
RSelenium | 1
Rserve | 1
rstudioapi(>=0.5) | 1
rsvg | 1
rtracklayer | 1
RUnit | 1
S4Vectors | 1
sangerseqR | 1
scrapeR | 1
selectr | 1
seqinr | 1
shiny(>=0.13.2) | 1
snow | 1
SnowballC | 1
sofa | 1
spacetime | 1
spatstat | 1
SSOAP | 1
stringdist | 1
Suggests:testthat | 1
Sxslt | 1
tabulizerjars | 1
testthat(>=0.7) | 1
tidytext | 1
tidyverse | 1
tiff | 1
tmap | 1
USAboundaries | 1
USAboundariesData | 1
VariantAnnotation | 1
vegan | 1
viridisLite | 1
wdman(>=0.2.2) | 1
weathermetrics | 1
webmockr | 1
webshot | 1
wellknown | 1
WikidataR | 1
wikitaxa | 1
withr | 1
wordcloud2 | 1
worrms | 1
XMLSchema | 1
xtable | 1
xts | 1
Schema validation is a useful and important concept for the distribution of metadata in formats such as XML and JSON, in which the standard-provider creates a schema (specified in an XML schema, XSD, for XML documents, or json-schema for JSON documents). Schemas allow us to go beyond the basic notion of making sure a file is simply valid XML or valid JSON, a requirement just to be read in by any parser. By detailing how the metadata must be structured, what elements must, can, and may not be included, and what data types may be used for those elements, schemas help developers consuming the data to anticipate these details and thus build applications which know how to process them. For the data creator, validation is a convenient way to catch data input errors and ensure a consistent data structure.
Because schema validation must ensure predictable behavior without knowledge of what any specific application is going to do with the data, it tends to be very strict. A simple application may not care if certain fields are missing or if integers are mistaken for characters, while for another application those same differences could lead to fatal errors.
The approach of JSON-LD is less prescriptive. JSON-LD uses the notion of “framing” to let each application specify how it expects its data to be structured. JSON-LD frames allow each developer consuming the data to handle many of the same issues that schema validation has previously assured. Readers should consult the official JSON-LD framing documentation for details on this approach.
library(jsonld)
library(jsonlite)
library(magrittr)
As before, we construct person objects for each author:
authors <-
  lapply(meta$author,
         function(author)
           person(given = author$given,
                  family = author$family,
                  email = author$email,
                  role = "aut"))
authors
[[1]]
[1] "Carl Boettiger <cboettig@gmail.com> [aut]"
Yay, that works as expected, since our metadata had all the fields we needed. However, other data missing from our example could potentially cause problems for our application. For instance, our first author lists no affiliation, so the following simply returns NULL:
meta$author[[1]]$affiliation
NULL
If we’re processing a lot of codemeta.json files and only one input is missing the affiliation, it could disrupt our whole process. If codemeta.json were prescribed by a JSON schema, we could insist in the schema that affiliation could not be missing. But that feels a bit heavy-handed: many use cases may have no need for affiliation. (Of course, we could just leave this problem for each developer to address explicitly with their own error-handling logic, but no developer would like that.)
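A lightweight way to guard against such missing fields (a sketch using purrr, not the approach taken in this vignette) is to supply an explicit default:

## sketch: pluck() returns .default instead of NULL for missing fields
purrr::pluck(meta, "author", 1, "affiliation", .default = NA)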
## [1] "Boettiger"
+[1] "Boettiger"
meta$author[[1]]$affiliation
NULL
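A frame along the following lines extracts just the names of any Person nodes (a sketch inferred from the description below; the "@explicit" directive limits the output to the requested properties):

## sketch: frame extracting only the names of Person nodes
person_frame <- '{
  "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
  "@explicit": "true",
  "@type": "Person",
  "givenName": {},
  "familyName": {}
}'
meta <-
  jsonld_frame(codemeta, person_frame) %>%
  fromJSON(simplifyVector = FALSE) %>%
  getElement("@graph")
meta[[1]]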
$id
[1] "http://orcid.org/0000-0002-1642-628X"

$type
[1] "Person"

$familyName
[1] "Boettiger"

$givenName
[1] "Carl"
Note that this has returned only the requested fields in the graph (along with the @id and @type, which are always included if provided, since they may be required to interpret the data properly). This frame extracts the givenName and familyName of any Person node it finds, regardless of where it occurs, while omitting the rest of the data. Note that since the frame requests these elements at the top level, they are returned as such, with each match a separate entry in the @graph. Our example has only one person, in meta[[1]]; had we more matches, they would appear in meta[[2]], etc. Note these returns are un-ordered.
The same underlying data can often be expressed in different ways, particularly when dealing with nested data. Framing can be of great help here in reshaping the data into the structure required by the application. For instance, it would be natural to access the email of the maintainer in the same manner as we did for the author, but this fails for our example because maintainer is defined only by reference to an ID:
meta <- fromJSON(codemeta, simplifyVector = FALSE)
paste("For complaints, email", meta$maintainer$email)
## [1] "For complaints, email "
+[1] "For complaints, email "
We can confirm that maintainer
is just an ID:
meta$maintainer
$`@id`
[1] "http://orcid.org/0000-0002-1642-628X"
We can use a frame with the special directive "@embed": "@always" to say that we want the full maintainer information embedded and not just referred to by id alone. Then we can subset maintainer just as we did for author.
frame <- '{
  "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
  "@embed": "@always"
}'

meta <-
  jsonld_frame(codemeta, frame) %>%
  fromJSON(simplifyVector = FALSE) %>%
  getElement("@graph") %>% getElement(1)
Now we can do
paste("For complaints, email", meta$maintainer$email)
## [1] "For complaints, email cboettig@gmail.com"
+[1] "For complaints, email cboettig@gmail.com"
and see that email
has been successfully returned from the matching ID under author data.
meta$buildInstructions
NULL
We just get NULL, rather than some unexpected type of object (e.g. a string that is not a URL). Note that the data is not lost, but simply not dereferenced:
names(meta)
## [1] "id" "type"
-## [3] "name" "codemeta:buildInstructions"
+[1] "id" "type"
+[3] "name" "codemeta:buildInstructions"
meta["codemeta:buildInstructions"]
$`codemeta:buildInstructions`
$`codemeta:buildInstructions`$type
[1] "Text"

$`codemeta:buildInstructions`$`@value`
[1] "Just install this package using devtools::install_github"
Note that this behavior only happens because the data declared the "@type": "Text"
explicitly. JSON-LD algorithms only believe what they are told about type and only look for consistency in declared types. If you give text but declare it as a "@type": "URL"
, or don’t declare the type at all, JSON-LD algorithms won’t know anything is amiss and the property will be compacted as usual.
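For instance (an illustrative sketch, not drawn from the package’s real metadata), declaring the same string as a URL would pass through expansion and compaction without complaint, even though the value is not actually a URL:

"codemeta:buildInstructions": {
  "@type": "URL",
  "@value": "Just install this package using devtools::install_github"
}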