From 68b15de0e8803152816c3887224b6c43cdfa405f Mon Sep 17 00:00:00 2001 From: "Tim Triche, Jr." Date: Sun, 21 Jul 2024 16:14:34 -0400 Subject: [PATCH 1/4] Add verification step for raw GEO data vs. prepared CSV Show that the CSV file provided does in fact contain the same information as the raw data files deposited on GEO for each GSM. --- episodes/02-setup.Rmd | 64 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/episodes/02-setup.Rmd b/episodes/02-setup.Rmd index 5e78ce78..ac278b34 100644 --- a/episodes/02-setup.Rmd +++ b/episodes/02-setup.Rmd @@ -185,6 +185,70 @@ download.file( ) ``` + +::::::::::::::::::::::::::::::::::::::: challenge + +## Check that the data file matches the raw data on GEO + +This is a substantial challenge. You don't need to do it! But you needn't trust us, either. +You can verify that the raw data deposited on GEO is directly comparable to the prepared CSV. +To look up raw data for samples (GSMxxxx) on GEO, we can use the [GEOquery](https://doi.org/10.1093/bioinformatics/btm254) package. +The `GEOquery::getGEOSuppFiles()` function then allows one to download or read the raw data files in question. +The tricky part of this is writing a function to extract the data of interest from the raw data files on GEO. +Once we have that function, we can check that the provided CSV file contains the same counts as the raw data. + +```{r check-geo-data} + +# Below, we read the summarized data file into a data.frame and inspect its dimensions (how many rows, how many columns). +counts_URL <- "https://github.com/carpentries-incubator/bioc-rnaseq/raw/main/episodes/data/GSE96870_counts_cerebellum.csv" +counts_cerebellum <- read.csv(url(counts_URL), row.names=1) +dim(counts_cerebellum) +# [1] 41786 22 + +# Each column in the data.frame represents one of 22 GSMs (GEO Samples). +GSMs_cerebellum <- colnames(counts_cerebellum) + +# GEOquery can look up the raw data locations for these GSMs. +library(GEOquery) +GSM_URLs <- sapply(GSMs_cerebellum, + function(GSM) getGEOSuppFiles(GSM, fetch=FALSE)[1, "url"]) + +# To extract the read counts per gene for each sample, we write a small function. +fetch_featcounts <- function(GSM_URL) { + message("Processing ", GSM_URL) + tsv <- readLines(gzcon(url(GSM_URL)))[-1][-1] # drop header and colnames + genes <- sapply(strsplit(tsv, "\t"), `[[`, 1) + featcounts <- as.integer(sapply(strsplit(tsv, "\t"), `[[`, 7)) + names(featcounts) <- genes + return(featcounts) +} + +# We can use do.call() and cbind() to make each result a column of read counts: +counts_GSMs <- data.frame(do.call(cbind, lapply(GSM_URLs, fetch_featcounts))) +dim(counts_GSMs) +# [1] 41786 22 + +``` + +Above we demonstrate that the numbers of genes and samples are the same. +However, that does not demonstrate that the read counts are the same. +Is there a function in R that will check whether two objects are identical? + +::::::::::::::::::::::::::::::::::::: solution + +The `identical()` function verifies that two objects in R are the same, save for their name. + +```{r check-geo-data-solution} +stopifnot(identical(dim(counts_cerebellum), dim(counts_GSMs))) +identical(counts_cerebellum, counts_GSMs) +# [1] TRUE +``` + +The prepared data file does indeed contain the same counts per gene as the raw GEO data files. + +::::::::::::::::::::::::::::::::::::: + + If you navigate to the `data` folder in your working directory, you should now find a file named `GSE96870_counts_cerebellum.csv`. From 0d29cd036118d089264aa35a6ee55e7e0784e752 Mon Sep 17 00:00:00 2001 From: "Tim Triche, Jr." Date: Sun, 21 Jul 2024 16:20:50 -0400 Subject: [PATCH 2/4] Tidy up a little bit One needs to use "[[" rather than `[[` in webR (I know not why). Also, breaking up the challenge solution with comments is more considerate of the learner's attention. --- episodes/02-setup.Rmd | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/episodes/02-setup.Rmd b/episodes/02-setup.Rmd index ac278b34..afdddd6c 100644 --- a/episodes/02-setup.Rmd +++ b/episodes/02-setup.Rmd @@ -217,13 +217,13 @@ GSM_URLs <- sapply(GSMs_cerebellum, fetch_featcounts <- function(GSM_URL) { message("Processing ", GSM_URL) tsv <- readLines(gzcon(url(GSM_URL)))[-1][-1] # drop header and colnames - genes <- sapply(strsplit(tsv, "\t"), `[[`, 1) - featcounts <- as.integer(sapply(strsplit(tsv, "\t"), `[[`, 7)) + genes <- sapply(strsplit(tsv, "\t"), "[[", 1) + featcounts <- as.integer(sapply(strsplit(tsv, "\t"), "[[", 7)) names(featcounts) <- genes return(featcounts) } -# We can use do.call() and cbind() to make each result a column of read counts: +# We use lapply(), cbind(), and do.call() to add each column of read counts: counts_GSMs <- data.frame(do.call(cbind, lapply(GSM_URLs, fetch_featcounts))) dim(counts_GSMs) # [1] 41786 22 @@ -239,7 +239,11 @@ Is there a function in R that will check whether two objects are identical? The `identical()` function verifies that two objects in R are the same, save for their name. ```{r check-geo-data-solution} + +# Don't bother checking the raw data if the dimensions don't match. stopifnot(identical(dim(counts_cerebellum), dim(counts_GSMs))) + +# If the dimensions do match, check if the data matches, too. identical(counts_cerebellum, counts_GSMs) # [1] TRUE ``` From ff4a0d6515e4d6f463cdf67096ef42e443726122 Mon Sep 17 00:00:00 2001 From: "Tim Triche, Jr." Date: Mon, 22 Jul 2024 06:35:52 -0400 Subject: [PATCH 3/4] install GEOquery if not yet installed like it says on the tin. should ensure build check passes... --- episodes/02-setup.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/episodes/02-setup.Rmd b/episodes/02-setup.Rmd index afdddd6c..38a6ce87 100644 --- a/episodes/02-setup.Rmd +++ b/episodes/02-setup.Rmd @@ -209,6 +209,8 @@ dim(counts_cerebellum) GSMs_cerebellum <- colnames(counts_cerebellum) # GEOquery can look up the raw data locations for these GSMs. +if (!require("BiocManager")) install.packages("BiocManager") +if (!require("GEOquery")) BiocManager::install("GEOquery") library(GEOquery) GSM_URLs <- sapply(GSMs_cerebellum, function(GSM) getGEOSuppFiles(GSM, fetch=FALSE)[1, "url"]) From e8cc792607be1ba341cb2ad0c767624939242eb0 Mon Sep 17 00:00:00 2001 From: "Tim Triche, Jr." Date: Mon, 22 Jul 2024 10:40:00 -0400 Subject: [PATCH 4/4] get rid of GEOquery dependency Check that compiled counts are the same as from the raw data. --- episodes/02-setup.Rmd | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/episodes/02-setup.Rmd b/episodes/02-setup.Rmd index 38a6ce87..4375ee86 100644 --- a/episodes/02-setup.Rmd +++ b/episodes/02-setup.Rmd @@ -209,11 +209,42 @@ dim(counts_cerebellum) GSMs_cerebellum <- colnames(counts_cerebellum) # GEOquery can look up the raw data locations for these GSMs. -if (!require("BiocManager")) install.packages("BiocManager") -if (!require("GEOquery")) BiocManager::install("GEOquery") -library(GEOquery) -GSM_URLs <- sapply(GSMs_cerebellum, - function(GSM) getGEOSuppFiles(GSM, fetch=FALSE)[1, "url"]) +# Unfortunately, the dependencies for it crash this build! +# We will illustrate the procedure and then use the results. +if (FALSE) { + if (!require("BiocManager")) install.packages("BiocManager") + if (!require("GEOquery")) BiocManager::install("GEOquery") + library(GEOquery) + getSuppURL <- function(GSM) GEOquery::getGEOSuppFiles(GSM, fetch=FALSE)[1, "url"] + GSM_URLs <- sapply(GSMs_cerebellum, getSuppURL) +} else { + GSM_URLprefix <- "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2545nnn" + GSM_URLsep <- "suppl" + GSM_files <- c("GSM2545336_10C_CTGAAGCT-GTACTGAC_L00M_featCounts.txt.gz", + "GSM2545337_11C_TAATGCGC-TATAGCCT_L00M_featCounts.txt.gz", + "GSM2545338_12C_TAATGCGC-ATAGAGGC_L00M_featCounts.txt.gz", + "GSM2545339_13C_TAATGCGC-CCTATCCT_L00M_featCounts.txt.gz", + "GSM2545340_14C_TAATGCGC-GGCTCTGA_L00M_featCounts.txt.gz", + "GSM2545341_17C_TAATGCGC-AGGCGAAG_L00M_featCounts.txt.gz", + "GSM2545342_1C_CTGAAGCT-TATAGCCT_L00M_featCounts.txt.gz", + "GSM2545343_20C_TAATGCGC-GTACTGAC_L00M_featCounts.txt.gz", + "GSM2545344_21C_CGGCTATG-TATAGCCT_L00M_featCounts.txt.gz", + "GSM2545345_22C_CGGCTATG-ATAGAGGC_L00M_featCounts.txt.gz", + "GSM2545346_25C_CGGCTATG-CCTATCCT_L00M_featCounts.txt.gz", + "GSM2545347_26C_CGGCTATG-GGCTCTGA_L00M_featCounts.txt.gz", + "GSM2545348_27C_CGGCTATG-AGGCGAAG_L00M_featCounts.txt.gz", + "GSM2545349_28C_CGGCTATG-TAATCTTA_L00M_featCounts.txt.gz", + "GSM2545350_29C_CGGCTATG-CAGGACGT_L00M_featCounts.txt.gz", + "GSM2545351_2C_CTGAAGCT-ATAGAGGC_L00M_featCounts.txt.gz", + "GSM2545352_30C_CGGCTATG-GTACTGAC_L00M_featCounts.txt.gz", + "GSM2545353_3C_CTGAAGCT-CCTATCCT_L00M_featCounts.txt.gz", + "GSM2545354_4C_CTGAAGCT-GGCTCTGA_L00M_featCounts.txt.gz", + "GSM2545362_5C_CTGAAGCT-AGGCGAAG_L00M_featCounts.txt.gz", + "GSM2545363_6C_CTGAAGCT-TAATCTTA_L00M_featCounts.txt.gz", + "GSM2545380_9C_CTGAAGCT-CAGGACGT_L00M_featCounts.txt.gz") + GSM_URLs <- paste(GSM_URLprefix, GSMs_cerebellum, "suppl", GSM_files, sep="/") + names(GSM_URLs) <- GSMs_cerebellum +} # To extract the read counts per gene for each sample, we write a small function. fetch_featcounts <- function(GSM_URL) {