diff --git a/paper/images/REDCapTidieR JOSS - Superheroes.png b/paper/images/REDCapTidieR JOSS - Superheroes.png new file mode 100644 index 00000000..f8a71f60 Binary files /dev/null and b/paper/images/REDCapTidieR JOSS - Superheroes.png differ diff --git a/paper/paper.bib b/paper/paper.bib index 593f11ec..41a7f663 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -40,7 +40,7 @@ @article{Wickham2014 pages={1–23} } -@Manual{r_citation, +@Manual{r_cit, title = {R: A Language and Environment for Statistical Computing}, author = {{R Core Team}}, organization = {R Foundation for Statistical Computing}, @@ -65,6 +65,22 @@ @Manual{redcapapi_cit url = {https://github.com/nutterb/redcapAPI/wiki}, } +@Manual{redcapdm_cit, + title = {REDCapDM: 'REDCap' Data Management}, + author = {João Carmezim and Judith Peñafiel and Pau Satorra and Esther García and Natàlia Pallarés and Cristian Tebé}, + year = {2023}, + note = {R package version 0.8.0}, + url = {https://ubidi.github.io/REDCapDM/}, +} + +@Manual{tidyredcap_cit, + title = {tidyREDCap: Helper Functions for Working with 'REDCap' Data}, + author = {Raymond Balise and Gabriel Odom and Anna Calderon and Layla Bouzoubaa and Wayne DeFreitas and Kyle Grealis}, + year = {2023}, + note = {R package version 1.1.1}, + url = {https://raymondbalise.github.io/tidyREDCap/index.html}, +} + @Manual{labelled_cit, title = {labelled: Manipulating Labelled Data}, author = {Joseph Larmarange}, @@ -99,6 +115,13 @@ @Article{tidyverse_cit doi = {10.21105/joss.01686}, } +@Manual{tibble_cit, + title = {tibble: Simple Data Frames}, + author = {Kirill Müller and Hadley Wickham}, + year = {2023}, + note = {https://tibble.tidyverse.org/, https://github.com/tidyverse/tibble}, +} + @Misc{openssf_cit, title = {OpenOpen Source Security Foundation_2023}, url={https://openssf.org/}, diff --git a/paper/paper.md b/paper/paper.md index 781b6238..50379f22 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -50,17 +50,17 @@ bibliography: paper.bib Capturing and 
storing electronic data is integral in the research world, yet often becomes a burden to the researchers themselves. [REDCap](https://www.project-redcap.org/) [@Harris2009; @Harris2019] alleviates this problem by offering a secure web application that lets users build databases and surveys with a robust front-end interface that can support data of any type, including data requiring compliance with standards for protected information. -For many researchers who use REDCap, the R language [@r_citation] is a powerful tool for extracting and analyzing their data. To take advantage of REDCap's REST API, the [`REDCapR`](https://cran.r-project.org/web/packages/REDCapR/index.html) [@redcapr_cit] and [`redcapAPI`](https://cran.r-project.org/web/packages/redcapAPI/index.html) [@redcapapi_cit] packages allow R users to extract data directly into their programming environment. The default extraction structure for a given REDCap database is referred to as the "block matrix," and is a singular, unwieldy, and "untidy" data table. The concept of "[tidy data](https://www.jstatsoft.org/article/view/v059i10)" [@Wickham2014] describes a framework for standard mapping and structuring of data where each variable forms a column, each observation forms a row, and each type of observational unit forms a table. Fundamentally, the block matrix breaks these tidy principles by obscuring the primary keys that identify individual records, leaving analysts with the arduous task of reformatting the matrix for usability. +For many researchers who use REDCap, the R programming language [@r_cit] is a powerful tool for extracting and analyzing their data. To take advantage of REDCap's REST API, the [`REDCapR`](https://cran.r-project.org/web/packages/REDCapR/index.html) [@redcapr_cit] and [`redcapAPI`](https://cran.r-project.org/web/packages/redcapAPI/index.html) [@redcapapi_cit] packages allow R users to extract data directly into their programming environment. 
The default extraction structure for a given REDCap database is referred to as the "block matrix," and is a singular, unwieldy, and "untidy" data table. The concept of "[tidy data](https://www.jstatsoft.org/article/view/v059i10)" [@Wickham2014] describes a framework for standard mapping and structuring of data where each variable forms a column, each observation forms a row, and each type of observational unit forms a table. The block matrix structure breaks these tidy principles by obscuring the primary keys that identify individual records, leaving analysts with the arduous task of reformatting the matrix for usability. To address these challenges, we developed `REDCapTidieR` as an open source R package that transforms the standard REDCap output into a format that adheres to tidy data principles. `REDCapTidieR` has the potential to save organizations and research staff immeasurable amounts of time, allowing them to quickly query their data without the need for intricate data parsing processes. # Statement of Need -As of 2023, the REDCap Consortium boasts nearly 3 million users across over 150 countries. REDCap databases exhibit significant variation in complexity, ranging from simple tables with easily identifiable records to more challenging scenarios where pinpointing a unique identifier is harder. This complexity often arises in databases that make use of "repeating instruments" and "repeating events." These concepts are explored in depth in the [`REDCapTidieR` documentation](https://chop-cgtinformatics.github.io/REDCapTidieR/articles/diving_deeper.html#longitudinal-redcap-projects), but fundamentally repeating events and instruments support longitudinal studies where subjects may have distinct timelines with varying levels of record granularity. Repeating instruments and events are unavoidable for most clinical trial studies and reformatting the data that belongs to them from the flattened block matrix can be a major pain point for analysts. 
+As of 2023, the REDCap Consortium boasts nearly 3 million users across more than 150 countries. REDCap databases exhibit significant variation in complexity, ranging from simple tables with easily identifiable records to more comprehensive builds where pinpointing a unique identifier is harder. This complexity often arises in databases that make use of "repeating instruments" and "repeating events." These concepts are explored in depth in the [`REDCapTidieR` documentation](https://chop-cgtinformatics.github.io/REDCapTidieR/articles/diving_deeper.html#longitudinal-redcap-projects), but, simply put, repeating events and instruments support longitudinal studies where subjects may have distinct timelines with varying levels of record granularity. Repeating instruments and events are unavoidable for most clinical trial studies, and reformatting the data that belongs to them from the block matrix can be a major pain point for analysts. -While there are a few existing REDCap tools for R documented by [`REDCap-tools`](https://redcap-tools.github.io/projects/), `REDCapTidieR` occupies a unique space by providing analysts with an opinionated framework that quickly prepares them for data analysis. Although some of the aforementioned tools also offer functions for data processing, such as the [`tidyREDCap`](https://raymondbalise.github.io/tidyREDCap/) and [`REDCapDM`](https://ubidi.github.io/REDCapDM/index.html) packages, `REDCapTidieR` is unique in how it restructures the block matrix into a format that is easily interpretable within the user's programmatic environment. Of the tools available, `REDCapTidieR` is the only one that fundamentally restructures the block matrix in its entirety and subsequently the only one that gives a definitive tidy solution to the problem of repeating instruments and events. 
+While there are a few existing REDCap tools for R documented by [`REDCap-tools`](https://redcap-tools.github.io/projects/), `REDCapTidieR` occupies a unique space by providing analysts with an opinionated framework that quickly returns a tidy data structure regardless of the size or complexity of the extracted database. Although some of these tools also offer functions for data processing, such as the [`tidyREDCap`](https://raymondbalise.github.io/tidyREDCap/) [@tidyredcap_cit] and [`REDCapDM`](https://ubidi.github.io/REDCapDM/index.html) [@redcapdm_cit] packages, `REDCapTidieR` is unique in how it restructures the block matrix into an easily interpretable format within the user's programmatic environment. Of the tools available, `REDCapTidieR` is the only one that fundamentally restructures the block matrix in its entirety and consequently the only one that gives a definitive tidy solution to the problem of repeating instruments and events. -REDCapTidieR was developed with deployment in production environments as a key consideration. To ensure the utmost confidence in the handling of user data, we've implemented an extensive test suite that exhibits 98% code coverage, as of the package's version 1.0 release. Ample documentation is accessible through a collection of package vignettes and articles, offering detailed insights into the opinionated framework, design structure, and a comprehensive glossary of terms associated with the REDCapTidieR package. These considerations have earned the package an [OpenSSF Best Practices certification](https://www.bestpractices.dev/en/projects/6845) [@openssf_cit], which certifies open source projects that meet stringent criteria for delivering high-quality and secure software. +`REDCapTidieR` was developed with production environment deployment as a key consideration. To ensure package stability, we've implemented an extensive test suite that exhibits 98% code coverage as of the package's version 1.0 release. 
Ample documentation is accessible through a collection of package vignettes and articles, offering detailed insights into the opinionated framework, design structure, and a comprehensive glossary of terms associated with the `REDCapTidieR` package. The package was also developed in alignment with the [OpenSSF Best Practices Badging program](https://www.bestpractices.dev/en/projects/6845) [@openssf_cit], which certifies open source projects that adhere to criteria for delivering high-quality, secure software. | Package | Data Export Support | Data Import Support | Data Manipulation | Tidy Reformatting | Production Ready | |-------------|---------------------|---------------------|-------------------|-------------------| ---------------- | @@ -74,14 +74,14 @@ Table 1: Comparative breakdown of the landscape for REDCap tools in R. # Design -Transformation of the block matrix into a friendlier structure is carried out by `REDCapTidieR` through a series of complex operations that result in the "supertibble." The supertibble, named after the [`tibble` package](https://tibble.tidyverse.org/), is presented as a table where each row corresponds to a REDCap instrument and each column corresponds to either that instrument's post-processed data (a "data tibble"), metadata, or useful information about that instrument itself. +Transformation of the block matrix into a friendlier structure is carried out by `REDCapTidieR` through a series of complex operations that result in the "supertibble." The supertibble, named after the [`tibble` package](https://tibble.tidyverse.org/) [@tibble_cit], is presented as a table where each row corresponds to a REDCap instrument and each column corresponds to either that instrument's post-processed data (a nested "data tibble"), metadata, or useful information about that instrument itself. 
-Unlike the block matrix, which combines all columns for record identification into one table, `REDCapTidieR` separates instruments so that only the variables necessary for identification of a record within the instrument are included in each data tibble. Below we provide a sample model that compares the standard output from a REDCap database with non-repeating and repeating instruments to one post-processed through `REDCapTidieR`. +Unlike the block matrix, which combines all columns for record identification into one table, `REDCapTidieR` separates instruments into individual data tibbles so that only the variables necessary for identification of a record within the instrument are included. Below, we provide a model that compares the standard output from a REDCap database with non-repeating and repeating instruments to one post-processed through `REDCapTidieR` using a sample of the open source [Superhero Database](https://www.superherodb.com/). -![Conceptual Model](/paper/images/REDCapTidieR%20JOSS.png) -Figure 1: Comparative model showing REDCap API export formats between the default behavior and `REDCapTidieR`. +![Conceptual Model](/paper/images/REDCapTidieR%20JOSS%20-%20Superheroes.png) +Figure 1: Comparative model showing REDCap API export formats between the default behavior and `REDCapTidieR`. -In this example, the supertibble displays three REDCap database instruments, with one repeating and two non-repeating. Below, one of each of these instrument types is expanded to show how `REDCapTidieR` separates these instruments into their own tabular list elements structured with only the identifiers necessary to pinpoint a specific record. This format makes tables easily joinable by analysts for whatever operations they may need later in their work. +In this example, the supertibble displays two REDCap database instruments, with demographic heroes information as non-repeating and corresponding hero powers as repeating. 
Below, one of each of these instrument types is expanded to show how `REDCapTidieR` separates these instruments into their own tabular list elements structured with only the identifiers necessary to pinpoint a specific record. This format makes tables easily joinable by analysts for whatever operations they may need later in their work. Additionally, REDCapTidieR comes equipped with features that address common requirements of analysts. Seamless integration with the `labelled` [@labelled_cit] package facilitates effortless application of variable labels to both data and metadata. An extension utilizing the `skimr` [@skimr_cit] package provides comprehensive metric summaries of metadata for exported REDCap databases. Lastly, through an extension leveraging the `openxlsx2` [@openxlsx2_cit] package, users can easily export REDCapTidieR data tibbles to individual XLSX sheets.