From e872c9dc8ed202e2253b65ca6e0872896bf50c76 Mon Sep 17 00:00:00 2001 From: Ramiro Magno Date: Thu, 4 Jul 2024 23:33:31 +0100 Subject: [PATCH] Add Representative Sequence article --- R/report_source.R | 10 +++- README.Rmd | 9 +++- README.md | 17 ++++++- _pkgdown.yml | 2 + man/report_last_modified.Rd | 8 +++- vignettes/articles/biotype_conflicts.Rmd | 8 ++-- .../articles/representative_sequence.Rmd | 48 +++++++++++++++++++ 7 files changed, 95 insertions(+), 7 deletions(-) create mode 100644 vignettes/articles/representative_sequence.Rmd diff --git a/R/report_source.R b/R/report_source.R index 2d7aa8f..c5d8d54 100644 --- a/R/report_source.R +++ b/R/report_source.R @@ -42,8 +42,16 @@ find_report_last_modified <- function(file) { #' Report last modification date #' +#' @description +#' #' [report_last_modified()] returns the last modified date and time of the -#' report source (local file or remote file). +#' report source: local file or remote file. If a local file, the modification +#' date will be that indicated by the file system; if a remote file, the date +#' of last update is that provided by HTTP header `"last-modified"`. +#' +#' MGI updates its reports weekly, every Thursday. However, not all reports are +#' updated each week. The return value of this function is the closest you will +#' get to a versioning of MGI report files. #' #' @param tbl Report data as a [tibble][tibble::tibble-package]. #' diff --git a/README.Rmd b/README.Rmd index e5b79f8..c76378b 100644 --- a/README.Rmd +++ b/README.Rmd @@ -53,7 +53,11 @@ Use `read_report()` to read any supported MGI report into R, e.g. to read `MRK_List1.rpt`: ```{r} -read_report("marker_list1", n_max = 10L) +(markers <- read_report("marker_list1", n_max = 10L)) +# Report file source +report_source(markers) +# Report file last modification date +report_last_modified(markers) ``` ## Code of Conduct @@ -72,6 +76,9 @@ package thoroughly before relying on it in critical applications. 
The authors disclaim all liability for any damage or loss resulting from the use of this package. Use of the `{mgi.report.reader}` package is at the user's own risk. +Support for reports is an ongoing process, but we welcome pull requests for +quicker coverage. + ## Citing this package - Firstly, if you use this package please do not forget to start by citing the diff --git a/README.md b/README.md index dd99249..ba819b8 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Use `read_report()` to read any supported MGI report into R, e.g. to read `MRK_List1.rpt`: ``` r -read_report("marker_list1", n_max = 10L) +(markers <- read_report("marker_list1", n_max = 10L)) #> # A tibble: 10 × 15 #> marker_status marker_type marker_id marker_symbol marker_name feature_type #> @@ -79,6 +79,18 @@ read_report("marker_list1", n_max = 10L) #> # marker_symbol_now , note ``` +``` r +# Report file source +report_source(markers) +#> [1] "https://www.informatics.jax.org/downloads/reports/MRK_List1.rpt" +``` + +``` r +# Report file last modification date +report_last_modified(markers) +#> [1] "2024-07-01 11:51:02 GMT" +``` + ## Code of Conduct Please note that the `{mgi.report.reader}` project is released with a @@ -97,6 +109,9 @@ before relying on it in critical applications. The authors disclaim all liability for any damage or loss resulting from the use of this package. Use of the `{mgi.report.reader}` package is at the user’s own risk. +Support for reports is an ongoing process, but we welcome pull requests +for quicker coverage. 
+ ## Citing this package - Firstly, if you use this package please do not forget to start by diff --git a/_pkgdown.yml b/_pkgdown.yml index 2b342cc..0d2a867 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -14,6 +14,8 @@ navbar: menu: - text: Genetic Marker href: articles/genetic_marker.html + - text: Representative Genomic Sequence + href: articles/representative_sequence.html articles: text: Reports menu: diff --git a/man/report_last_modified.Rd b/man/report_last_modified.Rd index b221a0d..bb7b553 100644 --- a/man/report_last_modified.Rd +++ b/man/report_last_modified.Rd @@ -15,7 +15,13 @@ object. } \description{ \code{\link[=report_last_modified]{report_last_modified()}} returns the last modified date and time of the -report source (local file or remote file). +report source: local file or remote file. If a local file, the modification +date will be that indicated by the file system; if a remote file, the date +of last update is that provided by HTTP header \code{"last-modified"}. + +MGI updates its reports weekly, every Thursday. However, not all reports are +updated each week. The return value of this function is the closest you will +get to a versioning of MGI report files. } \examples{ markers <- read_report("marker_list1", n_max = 10L) diff --git a/vignettes/articles/biotype_conflicts.Rmd b/vignettes/articles/biotype_conflicts.Rmd index 5d12463..ddaac5a 100644 --- a/vignettes/articles/biotype_conflicts.Rmd +++ b/vignettes/articles/biotype_conflicts.Rmd @@ -23,9 +23,11 @@ source of the classification is indicated in the `database` variable. The ## MGI Representative Gene Model -The variable `is_mgi_rep` stands for _is MGI representative_ and is encoded as a -logical vector that indicates whether the corresponding `gene_id` and `biotype` -values are the ones adopted by MGI as representative for the genetic marker. 
+The variable `is_mgi_rep` stands for _is the MGI representative genomic +sequence_ and is encoded as a logical vector that indicates whether the +corresponding `gene_id` and `biotype` values are the ones associated with the MGI +representative sequence. See `vignette("representative_sequence")` for more +details. ```{r} biotype_conflicts |> diff --git a/vignettes/articles/representative_sequence.Rmd b/vignettes/articles/representative_sequence.Rmd new file mode 100644 index 0000000..0e91ed7 --- /dev/null +++ b/vignettes/articles/representative_sequence.Rmd @@ -0,0 +1,48 @@ +--- +title: "Representative Genomic Sequence" +--- + +```{r setup, echo=FALSE} +library(mgi.report.reader) +``` + +In MGI, selecting a representative genomic sequence is crucial, as it influences the +representative transcript and protein sequences. Priorities for selecting +representative genomic sequences include gene model sequences from Ensembl, +NCBI, and VISTA annotations. In MGI, gene model sequences define the genomic +region using `start` and `end` coordinates from providers (`source`), including +regions defined by regulatory feature providers. + +## MGI representative genomic sequence + +For both protein-coding and noncoding RNA genes and pseudogenes, the +representative genomic sequence is typically chosen from Ensembl or NCBI gene +models. If both providers (`source`) offer gene models for a feature, the +shorter model is selected to avoid extended read-through transcripts. In the +absence of gene models, the longest associated GenBank genomic sequence is +chosen. For regulatory regions, the gene model from Ensembl, NCBI, or VISTA is +selected, with NCBI models preferred for enhancers when available. + +Whether a sequence is considered representative is indicated by the variable +`is_mgi_rep`. For an example based on the `MGI_BioTypeConflict.rpt` report, +see `vignette("biotype_conflicts")`. 
+ +## MGI representative transcript and protein sequences + +Representative transcript and protein sequences are selected algorithmically +based on the representative genomic sequence. If the genomic sequence is from +Ensembl, the longest Ensembl protein and corresponding transcript are chosen. If +it is not from Ensembl, the longest transcript from the genomic gene model +provider is selected, and, if coding, the longest associated protein from a +provider hierarchy is chosen. If the representative genomic sequence is not a +gene model from an annotation provider, both transcript and protein sequences +(if coding) are selected from provider (`source`) hierarchies: + +- **Transcript hierarchy**: Longest of NM RefSeq > NR RefSeq > GenBank non-EST RNA > +XM RefSeq > XR RefSeq > GenBank EST RNA. + +- **Protein hierarchy**: Longest of SWISS-PROT > RefSeq NP > TrEMBL > RefSeq XP. + +## References + +- Richard M Baldarelli, Cynthia L Smith, Martin Ringwald, Joel E Richardson, Carol J Bult, Mouse Genome Informatics Group, Mouse Genome Informatics: an integrated knowledgebase system for the laboratory mouse, Genetics, Volume 227, Issue 1, May 2024, iyae031. [doi:10.1093/genetics/iyae031](https://doi.org/10.1093/genetics/iyae031).