diff --git a/episodes/02-setup.Rmd b/episodes/02-setup.Rmd index 5e78ce78..4375ee86 100644 --- a/episodes/02-setup.Rmd +++ b/episodes/02-setup.Rmd @@ -185,6 +185,107 @@ download.file( ) ``` + +::::::::::::::::::::::::::::::::::::::: challenge + +## Check that the data file matches the raw data on GEO + +This is a substantial challenge. You don't need to do it! But you needn't trust us, either. +You can verify that the raw data deposited on GEO is directly comparable to the prepared CSV. +To look up raw data for samples (GSMxxxx) on GEO, we can use the [GEOquery](https://doi.org/10.1093/bioinformatics/btm254) package. +The `GEOquery::getGEOSuppFiles()` function then allows one to download or read the raw data files in question. +The tricky part of this is writing a function to extract the data of interest from the raw data files on GEO. +Once we have that function, we can check that the provided CSV file contains the same counts as the raw data. + +```{r check-geo-data} + +# Below, we read the summarized data file into a data.frame and inspect its dimensions (how many rows, how many columns). +counts_URL <- "https://github.com/carpentries-incubator/bioc-rnaseq/raw/main/episodes/data/GSE96870_counts_cerebellum.csv" +counts_cerebellum <- read.csv(url(counts_URL), row.names=1) +dim(counts_cerebellum) +# [1] 41786 22 + +# Each column in the data.frame represents one of 22 GSMs (GEO Samples). +GSMs_cerebellum <- colnames(counts_cerebellum) + +# GEOquery can look up the raw data locations for these GSMs. +# Unfortunately, the dependencies for it crash this build! +# We will illustrate the procedure and then use the results. +if (FALSE) { + if (!require("BiocManager")) install.packages("BiocManager") + if (!require("GEOquery")) BiocManager::install("GEOquery") + library(GEOquery) + getSuppURL <- function(GSM) GEOquery::getGEOSuppFiles(GSM, fetch=FALSE)[1, "url"] + GSM_URLs <- sapply(GSMs_cerebellum, getSuppURL) +} else { + GSM_URLprefix <- "https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2545nnn" + GSM_URLsep <- "suppl" + GSM_files <- c("GSM2545336_10C_CTGAAGCT-GTACTGAC_L00M_featCounts.txt.gz", + "GSM2545337_11C_TAATGCGC-TATAGCCT_L00M_featCounts.txt.gz", + "GSM2545338_12C_TAATGCGC-ATAGAGGC_L00M_featCounts.txt.gz", + "GSM2545339_13C_TAATGCGC-CCTATCCT_L00M_featCounts.txt.gz", + "GSM2545340_14C_TAATGCGC-GGCTCTGA_L00M_featCounts.txt.gz", + "GSM2545341_17C_TAATGCGC-AGGCGAAG_L00M_featCounts.txt.gz", + "GSM2545342_1C_CTGAAGCT-TATAGCCT_L00M_featCounts.txt.gz", + "GSM2545343_20C_TAATGCGC-GTACTGAC_L00M_featCounts.txt.gz", + "GSM2545344_21C_CGGCTATG-TATAGCCT_L00M_featCounts.txt.gz", + "GSM2545345_22C_CGGCTATG-ATAGAGGC_L00M_featCounts.txt.gz", + "GSM2545346_25C_CGGCTATG-CCTATCCT_L00M_featCounts.txt.gz", + "GSM2545347_26C_CGGCTATG-GGCTCTGA_L00M_featCounts.txt.gz", + "GSM2545348_27C_CGGCTATG-AGGCGAAG_L00M_featCounts.txt.gz", + "GSM2545349_28C_CGGCTATG-TAATCTTA_L00M_featCounts.txt.gz", + "GSM2545350_29C_CGGCTATG-CAGGACGT_L00M_featCounts.txt.gz", + "GSM2545351_2C_CTGAAGCT-ATAGAGGC_L00M_featCounts.txt.gz", + "GSM2545352_30C_CGGCTATG-GTACTGAC_L00M_featCounts.txt.gz", + "GSM2545353_3C_CTGAAGCT-CCTATCCT_L00M_featCounts.txt.gz", + "GSM2545354_4C_CTGAAGCT-GGCTCTGA_L00M_featCounts.txt.gz", + "GSM2545362_5C_CTGAAGCT-AGGCGAAG_L00M_featCounts.txt.gz", + "GSM2545363_6C_CTGAAGCT-TAATCTTA_L00M_featCounts.txt.gz", + "GSM2545380_9C_CTGAAGCT-CAGGACGT_L00M_featCounts.txt.gz") + GSM_URLs <- paste(GSM_URLprefix, GSMs_cerebellum, "suppl", GSM_files, sep="/") + names(GSM_URLs) <- GSMs_cerebellum +} + +# To extract the read counts per gene for each sample, we write a small function. +fetch_featcounts <- function(GSM_URL) { + message("Processing ", GSM_URL) + tsv <- readLines(gzcon(url(GSM_URL)))[-1][-1] # drop header and colnames + genes <- sapply(strsplit(tsv, "\t"), "[[", 1) + featcounts <- as.integer(sapply(strsplit(tsv, "\t"), "[[", 7)) + names(featcounts) <- genes + return(featcounts) +} + +# We use lapply(), cbind(), and do.call() to add each column of read counts: +counts_GSMs <- data.frame(do.call(cbind, lapply(GSM_URLs, fetch_featcounts))) +dim(counts_GSMs) +# [1] 41786 22 + +``` + +Above we demonstrate that the numbers of genes and samples are the same. +However, that does not demonstrate that the read counts are the same. +Is there a function in R that will check whether two objects are identical? + +::::::::::::::::::::::::::::::::::::: solution + +The `identical()` function verifies that two objects in R are the same, save for their name. + +```{r check-geo-data-solution} + +# Don't bother checking the raw data if the dimensions don't match. +stopifnot(identical(dim(counts_cerebellum), dim(counts_GSMs))) + +# If the dimensions do match, check if the data matches, too. +identical(counts_cerebellum, counts_GSMs) +# [1] TRUE +``` + +The prepared data file does indeed contain the same counts per gene as the raw GEO data files. + +::::::::::::::::::::::::::::::::::::: + + If you navigate to the `data` folder in your working directory, you should now find a file named `GSE96870_counts_cerebellum.csv`.