diff --git a/CONTRIBUTORS.yaml b/CONTRIBUTORS.yaml index 34dccefc7c6b12..c7cd83b66db96e 100644 --- a/CONTRIBUTORS.yaml +++ b/CONTRIBUTORS.yaml @@ -319,6 +319,12 @@ cfusterbarcelo: orcid: 0000-0002-4784-6957 bio: Post-doctoral researcher at UC3M +Camila-goclowski: + name: Camila Goclowski + joined: 2023-01 + email: camila.goclowski@gmail.com + linkedin: camila-goclowski + charitylaw: name: Charity Law joined: 2018-09 diff --git a/Gemfile.lock b/Gemfile.lock index 6ff976f294df95..c7d6bf24498f29 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,15 +1,8 @@ GEM remote: https://rubygems.org/ specs: - Ascii85 (1.1.0) - addressable (2.8.4) + addressable (2.8.5) public_suffix (>= 2.0.2, < 6.0) - afm (0.2.2) - async (2.6.2) - console (~> 1.10) - fiber-annotation - io-event (~> 1.1) - timers (~> 4.1) awesome_bot (1.20.0) parallel (= 1.20.1) bibtex-ruby (6.0.0) @@ -23,9 +16,6 @@ GEM commander (4.6.0) highline (~> 2.0.0) concurrent-ruby (1.2.2) - console (1.17.2) - fiber-annotation - fiber-local csl (2.0.0) namae (~> 1.0) rexml @@ -38,18 +28,15 @@ GEM ffi (>= 1.15.0) eventmachine (1.2.7) fastimage (2.2.7) - ffi (1.15.5) - fiber-annotation (0.2.0) - fiber-local (1.0.0) + ffi (1.16.2) forwardable-extended (2.6.0) - google-protobuf (3.23.3-x86_64-linux) - hashery (2.1.2) + google-protobuf (3.24.3-x86_64-linux) highline (2.0.3) - html-proofer (5.0.7) + html-proofer (4.4.3) addressable (~> 2.3) - async (~> 2.1) + mercenary (~> 0.3) nokogiri (~> 1.13) - pdf-reader (~> 2.11) + parallel (~> 1.10) rainbow (~> 3.0) typhoeus (~> 1.3) yell (~> 2.0) @@ -57,7 +44,6 @@ GEM http_parser.rb (0.8.0) i18n (1.14.1) concurrent-ruby (~> 1.0) - io-event (1.2.2) jekyll (4.3.2) addressable (~> 2.4) colorator (~> 1.0) @@ -94,40 +80,33 @@ GEM rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.4.0) namae (1.1.1) - nokogiri (1.15.2-x86_64-linux) + nokogiri (1.15.4-x86_64-linux) racc (~> 1.4) parallel (1.20.1) pathutil (0.16.2) forwardable-extended (~> 2.6) - pdf-reader (2.11.0) - Ascii85 (~> 1.0) - afm (~> 0.2.1) - hashery (~> 2.0) - ruby-rc4 - ttfunk - pkg-config (1.5.2) - public_suffix (5.0.1) + pkg-config (1.5.5) + public_suffix (5.0.3) racc (1.7.1) rainbow (3.1.1) + rake (13.0.6) rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.5) - rouge (4.1.2) - ruby-rc4 (0.1.5) + rexml (3.2.6) + rouge (4.1.3) safe_yaml (1.0.5) - sass-embedded (1.63.5-x86_64-linux-gnu) + sass-embedded (1.68.0) google-protobuf (~> 3.23) + rake (>= 13.0.0) terminal-table (3.0.2) unicode-display_width (>= 1.1.1, < 3) - timers (4.3.5) - ttfunk (1.7.0) typhoeus (1.4.0) ethon (>= 0.9.0) - unicode-display_width (2.4.2) + unicode-display_width (2.5.0) webrick (1.8.1) yell (2.2.2) - zeitwerk (2.6.8) + zeitwerk (2.6.12) PLATFORMS x86_64-linux @@ -150,4 +129,4 @@ DEPENDENCIES webrick BUNDLED WITH - 2.4.2 + 2.4.20 diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot1.png b/topics/single-cell/images/scrna-SeuratRStudio/plot1.png new file mode 100644 index 00000000000000..de64447fe17d16 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot1.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot10.png b/topics/single-cell/images/scrna-SeuratRStudio/plot10.png new file mode 100644 index 00000000000000..af883f701bf7bf Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot10.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot11.png b/topics/single-cell/images/scrna-SeuratRStudio/plot11.png new file mode 100644 index 00000000000000..caf824896dfdf3 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot11.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot12.png b/topics/single-cell/images/scrna-SeuratRStudio/plot12.png new file mode 100644 index 00000000000000..908995b2f1dfb8 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot12.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot13.png b/topics/single-cell/images/scrna-SeuratRStudio/plot13.png new file mode 100644 index 00000000000000..49be18d0baaa3a Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot13.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot14.png b/topics/single-cell/images/scrna-SeuratRStudio/plot14.png new file mode 100644 index 00000000000000..1ec2aa2b2f643e Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot14.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot15.png b/topics/single-cell/images/scrna-SeuratRStudio/plot15.png new file mode 100644 index 00000000000000..e02f7531b3760d Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot15.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot16.png b/topics/single-cell/images/scrna-SeuratRStudio/plot16.png new file mode 100644 index 00000000000000..4409e65390837c Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot16.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot17.png b/topics/single-cell/images/scrna-SeuratRStudio/plot17.png new file mode 100644 index 00000000000000..aeb5abe77ed2fe Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot17.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot18.png b/topics/single-cell/images/scrna-SeuratRStudio/plot18.png new file mode 100644 index 00000000000000..42a25747bd45b4 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot18.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot19.png b/topics/single-cell/images/scrna-SeuratRStudio/plot19.png new file mode 100644 index 00000000000000..9180e103887722 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot19.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot2.png b/topics/single-cell/images/scrna-SeuratRStudio/plot2.png new file mode 100644 index 00000000000000..6d77203c4f051e Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot2.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot20.png b/topics/single-cell/images/scrna-SeuratRStudio/plot20.png new file mode 100644 index 00000000000000..40302e511e0496 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot20.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot21.png b/topics/single-cell/images/scrna-SeuratRStudio/plot21.png new file mode 100644 index 00000000000000..b25e20e162c652 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot21.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot3.png b/topics/single-cell/images/scrna-SeuratRStudio/plot3.png new file mode 100644 index 00000000000000..7352a03805915d Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot3.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot4.png b/topics/single-cell/images/scrna-SeuratRStudio/plot4.png new file mode 100644 index 00000000000000..7dc7a0172b92a8 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot4.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot5.png b/topics/single-cell/images/scrna-SeuratRStudio/plot5.png new file mode 100644 index 00000000000000..22b476fad949d8 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot5.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot6.png b/topics/single-cell/images/scrna-SeuratRStudio/plot6.png new file mode 100644 index 00000000000000..2ecf614dfa3de0 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot6.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot7.png b/topics/single-cell/images/scrna-SeuratRStudio/plot7.png new file mode 100644 index 00000000000000..7e601acf8978b4 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot7.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot8.png b/topics/single-cell/images/scrna-SeuratRStudio/plot8.png new file mode 100644 index 00000000000000..ecbd38f895fbd8 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot8.png differ diff --git a/topics/single-cell/images/scrna-SeuratRStudio/plot9.png b/topics/single-cell/images/scrna-SeuratRStudio/plot9.png new file mode 100644 index 00000000000000..27a9a8915791f8 Binary files /dev/null and b/topics/single-cell/images/scrna-SeuratRStudio/plot9.png differ diff --git a/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/faqs/index.md b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/faqs/index.md new file mode 100644 index 00000000000000..9ce3fe4fce824b --- /dev/null +++ b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/faqs/index.md @@ -0,0 +1,3 @@ +--- +layout: faq-page +--- diff --git a/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/preamble.md b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/preamble.md new file mode 100644 index 00000000000000..e9b259fd4bdd26 --- /dev/null +++ b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/preamble.md @@ -0,0 +1,78 @@ +# Introduction + +You’ve previously done all the work to make a single cell matrix. Now it’s time to fully process our data using Seurat: remove low quality cells, reduce the many dimensions of data that make it difficult to work with, and ultimately try to define clusters and find some biological meaning and insights! There are many packages for analysing single cell data - Seurat ({% cite Satija2015 %}), Scanpy ({% cite Wolf2018 %}), Monocle ({% cite Trapnell2014 %}), Scater ({% cite McCarthy2017 %}), and many more. We’re working with Seurat in RStudio because it is well updated, broadly used, and highly trusted within the field of bioinformatics. + +> +> This tutorial is significantly based on the Seurat documentation({% cite Satija2015 %}) as well as [Seurat's Guided Clustering Tutorial](https://satijalab.org/seurat/articles/pbmc3k_tutorial.html). +{: .comment} + +> +> +> In this tutorial, we will cover: +> +> 1. TOC +> {:toc} +> +{: .agenda} + +We’ll provided you with experimental data to analyse from a mouse dataset of fetal growth restriction ({% cite Bacon2018 %}). This is the full dataset generated from [this tutorial]({% link topics/single-cell/tutorials/scrna-case_alevin/tutorial.md %}). + +# Get Data onto Galaxy +To start, let's get our dataset loaded into Galaxy. + +You can access the data for this tutorial in multiple ways: +1. **EBI Data Retrieval** - You may retrieve that files necessary to construct a Seurat Object in this way.Doing to will alleviate the necessity to convert AnnData (Python) objects into Seurat (R) objects: + +> GetData +> +> Run{% tool [EBI SCXA Data Retrieval](toolshed.g2.bx.psu.edu/repos/ebi-gxa/retrieve_scxa/retrieve_scxa/v0.0.2+galaxy2) %} with the following parameters: +> - *"SC-Atlas experiment accession"*: `E-MTAB-6945` +> - *"Choose the type of matrix to download"*: `Raw filtered counts` +{: .hands_on} + +2. **Importing from a history** - You can import [this history](https://usegalaxy.eu/u/camila-goclowski/h/fpe) + + {% snippet faqs/galaxy/histories_import.md %} +This also alleviates the necessity to convert the AnnData object into a Seurat one. + +3. **Uploading from Zenodo** (see below) + +> Option 3: Uploading from Zenodo +> +> 1. Create a new history for this tutorial +> 2. Import the AnnData object from [Zenodo]({{ page.zenodo_link }}) +> +> ``` +> {{ page.zenodo_link }}/files/Mito-counted_AnnData +> ``` +> +> {% snippet faqs/galaxy/datasets_import_via_link.md %} +> +> 3. **Rename** {% icon galaxy-pencil %} the datasets `Mito-counted AnnData` +> 4. Check that the datatype is `h5ad` +> +> {% snippet faqs/galaxy/datasets_change_datatype.md datatype="h5ad" %} +> +{: .hands_on} + +# Important tips for easier analysis + +{% snippet faqs/galaxy/tutorial_mode.md %} + +> +> - The Galaxy tool search panel sometimes doesn't find the tools we need from the thousands available. +> - You'll have a much easier time selecting tools from the panel (if you aren't using tutorial mode!) if you are on the [https://humancellatlas.usegalaxy.eu](https://humancellatlas.usegalaxy.eu) +{: .comment} + +# Open RStudio in Galaxy +You now should have imported the matrix.mtx, genes.tsv, barcodes.tsv, and exp_design.tsv files into your Galaxy history. For the rest of the workflow, let's move onto RStudio and get coding! +> Open RStudio in Galaxy +> Run {% tool [RStudio](interactive_tool_rstudio)%} +{: .hands_on} + + +>Next Step +> The interactive RStudio tool should begin to load now. Make your way over to your Active Interactive Tools page (User (in the top bar of the usegalaxy page)> Active Interactive Tools > RStudio) +> +>Alternatively, you may use the view (eye) button in your Galaxy History to open the interactive RStudio environment. +{: .comment} diff --git a/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/tutorial.bib b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/tutorial.bib new file mode 100644 index 00000000000000..078f3530483dd0 --- /dev/null +++ b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/tutorial.bib @@ -0,0 +1,78 @@ +# This is the bibliography file for your tutorial. +# +# To add bibliography (bibtex) entries here, follow these steps: +# 1) Find the DOI for the article you want to cite +# 2) Go to https://doi2bib.org and fill in the DOI +# 3) Copy the resulting bibtex entry into this file +# +# To cite the example below, in your tutorial.md file +# use {% cite Batut2018 %} +# +# If you want to cite an online resourse (website etc) +# you can use the 'online' format (see below) +# +# You can remove the examples below + +@article{Bacon2018, + doi = {10.3389/fimmu.2018.02523}, + url = {https://doi.org/10.3389/fimmu.2018.02523}, + year = {2018}, + month = nov, + publisher = {Frontiers Media {SA}}, + volume = {9}, + author = {Wendi A. Bacon and Russell S. Hamilton and Ziyi Yu and Jens Kieckbusch and Delia Hawkes and Ada M. Krzak and Chris Abell and Francesco Colucci and D. Stephen Charnock-Jones}, + title = {Single-Cell Analysis Identifies Thymic Maturation Delay in Growth-Restricted Neonatal Mice}, + journal = {Frontiers in Immunology} +} + +@article{Wolf2018, + doi = {10.1186/s13059-017-1382-0}, + url = {https://doi.org/10.1186/s13059-017-1382-0}, + year = {2018}, + month = feb, + publisher = {Springer Science and Business Media {LLC}}, + volume = {19}, + number = {1}, + author = {F. Alexander Wolf and Philipp Angerer and Fabian J. Theis}, + title = {{SCANPY}: large-scale single-cell gene expression data analysis}, + journal = {Genome Biology} +} + +@article{Satija2015, + doi = {10.1038/nbt.3192}, + url = {https://doi.org/10.1038/nbt.3192}, + year = {2015}, + month = May, + publisher = {Nature Anerica Inc. All rights reserved}, + volume = {33}, + number = {5}, + author = {Rahul Satija and Jeffrey A. Farrell and David Gennert and Alexander F Schier and Aviv Regev}, + title = {Spatial reconstruction of single-cell gene expression data}, + journal = {Nature Biotechnology} +} + +@article{Trapnell2014, + doi = {10.1038/nbt.2859}, + url = {https://doi.org/10.1038/nbt.2859}, + year = {2014}, + month = April, + publisher = {Nature Anerica Inc. All rights reserved}, + volume = {32}, + number = {4}, + author = {Cole Trapnell and Davide Cacchiarelli and Jonna Grimsby and Prapti Pokharel and Shuqiang Li and Michael Morse and Niall J Lennon and Kenneth J Livak and Tarjei S Mikkelsen and John L Rinn}, + title = {Pseudo-temporal ordering of individual cells reveals dynamics and regulators of cell fate decisions}, + journal = {Nature Biotechnology} +} + +@article{McCarthy2017, + doi = {10.1093/bioinformatics/btw777}, + url = {https://doi.org/10.1093/bioinformatics/btw777}, + year = {2017}, + month = January, + publisher = {Advanced Access Publication}, + volume = {33}, + number = {8}, + author = {Davis J McCarthy and Kieran R Campbell and Aaron T L Lun and Quin F Wills}, + title = {Scater: pre-processing, quality control, normalization and visualization of single-cell RNA-seq data in R}, + journal = {Bioinformatics} +} diff --git a/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/tutorial.md b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/tutorial.md new file mode 100644 index 00000000000000..0613c91ae2d937 --- /dev/null +++ b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/tutorial.md @@ -0,0 +1,704 @@ +--- +layout: tutorial_hands_on + +title: 'Filter, Plot, and Explore with Seurat in RStudio' +subtopic: single-cell-CS +priority: 6 +zenodo_link: 'https://zenodo.org/record/7053673' + +questions: +- Is my single cell dataset a quality dataset? +- How do I pick thresholds and parameters in my analysis? What’s a “reasonable” number, and will the world collapse if I pick the wrong one? +- How do I generate and annotate cell clusters? + +objectives: +- Interpret quality control plots to direct parameter decisions +- Repeat analysis from matrix to clustering to labelling clusters +- Identify decision-making points +- Appraise data outputs and decisions +- Explain why single cell analysis is an iterative process (i.e. the first plots you generate are not final, but rather you go back and re-analyse your data repeatedly) + +time_estimation: 3H + +key_points: +- Being able to switch between Galaxy and RStudio when analyzing datasets can be useful when looking to adjust default parameters within Seurat's functions and workflow. +- Seurat in RStudio gives more flexibility and specificity of analyses, but Galaxy offers great reproducibility and ease of analysis. +- Beginning to learn the syntax and use of R will expand your + +requirements: +- + type: "internal" + topic_name: single-cell + tutorials: + - scrna-case_alevin-combine-datasets + - scrna-case_basic-pipeline + + +tags: +- single-cell +- seurat +- rstudio + +contributions: + authorship: + - Camila-goclowski + editing: + - nomadscientist + - hexylena + - mtekman + - shiltemann + - pavanvidem + + + +notebook: + language: r + snippet: topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/preamble.md +--- + +# Setting your environment +First thing's first, we need to load the packages we will be using. In order to use any functions of a package, we must first call the library of that package. In your console (likely in the lower left corner of your RStudio window), run the following lines of code to do so: + +```r +library(Matrix) +library(Seurat) +library(dplyr) +``` + +The packages are called--now let's get our data files moved from our Galaxy history and into our RStudio enviornment so that we can create a Seurat object. + +# Upload, view and modify the files +Now that we've made it into RStudio, and called the packages we'll use, let's begin moving datasets from Galaxy and into our RStudio environment. Galaxy helps us with this by providing the gx_get() function, which will tell us the file paths for the datasets currently occupying our Galaxy histories. + +So, for example, our matrix was the first to be imported. As such, we will ask for the file path of the first piece of data in our history: we will tell gx_get() the number of the dataset in our history that we are interested in, and the function will output the path we need to bring that dataset into R. + +```r +gx_get(1) #get the file path for matrix.mtx: #1 in Galaxy history +``` + +Now we have the file path! We can use the Matrix package, specifically their readMM() function, to read our counts matrix into our environment, using our file path to let it know where to find the matrix. + +```r +matrix.mtx<-readMM("/import/1") +``` +Now we will do the same thing with the feature, barcode, and experimental design files. + +> Don't skip ahead! +> Don't try to skip ahead and fill in the position of the dataset, reading in the files very often will not work without having used the gx_get() function--if it does, well, lucky you. +> +{: .warning} + +We'll run the same command, simply replacing the data number in order to ask for the file path to data in an alternative slot in our Galaxy history. Then, we will use the read.delim() function in read in the list of genes, cells, and cell annotations provided to us by the researchers: +```r +gx_get(2) #genes.tsv +genes.tsv<-read.delim("/import/2", header = FALSE) + +gx_get(3) #barcodes.tsv +barcodes.tsv<-read.delim("/import/3", header = FALSE) + +gx_get(4) #exp_design.tsv +exp_design.tsv<-read.delim("/import/4") +``` +The format of the experimental design dataset contains cell barcodes as the first column of data with simply numbered (1-n) row names. In order for Seurat to properly use this dataset, we will need to make the cell barcodes the row names. This can be accomplished by running the following line of code: + +```r +rownames(exp_design.tsv)<-exp_design.tsv$Assay +``` + +Now, in our RStudio environment, we should have all of the data sets necessary to create a Seurat Object: the matrix, a file with feature (gene) names, a file with cell barcodes, and an optional, but highly useful, experimental design file containing sample (cell-level) metadata. + +# Generating a Seurat object +Next we will add row and column names to our matrix. In the end, this will leave us with a matrix whose rows are genes, columns are cell barcodes, and each value in the matrix represent the expression value of a given gene in a given cell. + +>About Seurat Objects +>The order of the dimensions, such that genes are the first and cells are the second is a set characteristic of Seurat objects. +> +>You will recieve an error if you try to label the first dimension with cell barcodes or the second with genes. This is because the Dimname slots are like empty fill in the blanks: if the number of labels input doesn't match the number of blanks in that dimension, that is the number of labels don't match the number of cells/genes, Seurat will not accept the labels. +{: .tip} + +```r +dimnames(matrix.mtx) <- list(genes.tsv$V2, barcodes.tsv$V1) +``` + +In a more typical Seurat pipeline, or on a local version of RStudio, this step would be replaced with a Read10x() step. Read10x() is Seurat's function to create a matrix and add in feature and barcode names simultaneously. However, due to the nature of how Galaxy's histories and interactive environments communicate with one another, we'll use this manual method. + +>How to Use Read10X +>The only necessary parameter of Seurat's Read10X() fucntion is the file path to the directory (folder) containing the matrix.mtx, barcodes.tsv, and genes.tsv (sometimes also called features.tsv) files. +> +>So if the EBI SCXA Retrieval tool were to have output a data directory in the sixth history position, the following code would be able to import a labelled counts matrix called "labelled matrix" into our RStudio environment: +>```r +>gx_get(6) #get the file path +>labelled_matrix<-Read10X(dir = "/import/6") +>``` +> +{: .tip} + +Now we will create a Seurat object using our newly labelled counts matrix! Make sure you have called the Seurat library, first, or RStudio will not recognize the function. + +```r +srt<-CreateSeuratObject(counts = matrix.mtx) +``` + +You've created a Seurat object, congratulations! + +# Adding Cell Level Metadata + + Now that we have an object, we can add in our metadata from our experimental design dataframe (table). This will be useful to us shortly as we begin to visualize our data! + +```r +srt$Sex<-exp_design.tsv$Sample.Characteristic.sex. +srt$Organism<-exp_design.tsv$Sample.Characteristic.organism. +srt$Strain<-exp_design.tsv$Sample.Characteristic.strain. +srt$Developmental.Stage<-exp_design.tsv$Sample.Characteristic.developmental.stage. +srt$Age<-exp_design.tsv$Sample.Characteristic.age. +srt$Individual<-exp_design.tsv$Sample.Characteristic.individual. +srt$Disease<-exp_design.tsv$Sample.Characteristic.disease. +srt$Genotype<-exp_design.tsv$Sample.Characteristic.genotype. +srt$Organism.Part<-exp_design.tsv$Sample.Characteristic.organism.part. +srt$Cell.Type<-exp_design.tsv$Sample.Characteristic.cell.type. +srt$Factor.Value.Genotype<-exp_design.tsv$Factor.Value.genotype. +``` +>Syntax Lesson +>The code preceding the left pointing arrow will indicate where to put your metadata (the name of your new metadata column: object$new_metadata_columnname), and the code following the arrow will denote where to find that metadata information (metadata_table$columnname) +> +{: .tip} + +Now that we have our almost fully annotated object, we will add one more metadata column: percent mitochondrial (perc.mt). This metadata column will denote what percentage of a cell's feature (gene) expression is mitochondrial--which will be useful to us shortly as we begin to filter our data. + +```r +srt <- PercentageFeatureSet(srt, pattern = "^mt-", col.name = "perc.mt") +``` + +For the sake of this data set, and many others, the mitochondrial genes will all be marked with an "mt" as the prefix, so that is how we have asked Seurat's PercentageFeatureSet() function to search for mitochondrial genes in the line of code above. + +With that being said, when you are analyzing your own data, it is highly recommended that you identify how your data set has labelled mitochondrial genes to ensure that you are calculating the correct percentages. + +> Careful +> The "mt" prefix may not always include all mitochondrial genes depending on how your dataset has been labelled. +> +>Sometimes the labels may be done via a capital prefix ("^MT") and in some cases, mitochondrial genes must be identified via feature-level metadata like a GTF file. +{: .warning} + + +# QC Plots +Now that we have a complete Seurat object, we can begin the filtering process. + +There will be a number of ‘cells’ that are actually just empty droplets or low-quality. There will also be genes that could be sequencing artifacts or that appear with such low frequency that statistical tools will fail to accurately analyse them. + +This background noise of both cells and genes not only makes it harder to distinguish real biological information from artifacts, but also makes it computationally demanding to analyze. + +We want to filter our cells, but first we need to know what our data looks like. There are a number of subjective decisions to make within scRNA-seq analysis, for instance we now need to make informed decisions about where to set our thresholds (more on that soon!). + +We’re going to plot our data a few different ways. Different bioinformaticians might prefer to see the data in different ways, and here we are only generating a few of the plots you can use. Ultimately you need to go with what makes the most sense to you. + +So let's generate some QC plots. First off, let's check our dataset for batch effect: + +```r +VlnPlot(srt, group.by = "Individual", + features = "nCount_RNA", + log = TRUE) +``` +![Violin Plot split by Individual/Batch](../../images/scrna-SeuratRStudio/plot1.png "Violin Plot of counts split by Individual.") + +This plot shows us the number of cells split by the individual (mouse) from which the cells came from. Now, depending on your experimental design, batch may be represented by something other than individual--like timepoint or even the wet lab researcher who isolated the cells. + +Ideally, we would like to see a relatively even distribution of counts for each individual (or batch) but if there isn’t, fear not, we can regress this variable out in a later step. + +>Syntax Lesson +>In order to accurately assess potential batch effects, use the "group.by" parameter to indicate the variable which differed across experiments. +> +{: .tip} + +Now let's get an idea of how different variables, like the sex or genotype of the mice, might be represented across our dataset. + +1. Sex? + ```r + VlnPlot(srt, group.by = "Sex",features = "nCount_RNA",log = TRUE) + ``` + ![Violin Plot split by Sex](../../images/scrna-SeuratRStudio/plot2.png "Violin Plot of counts split by Sex.") + +2. Genotype? + ```r + VlnPlot(srt, group.by = "Genotype", features = "nCount_RNA", log = TRUE) + ``` + ![Violin Plot split by Genotype](../../images/scrna-SeuratRStudio/plot3.png "Violin Plot of counts split by Genotype--Mutant versus Control.") + +# Finding Our Filtering Parameters +Now that we have a better understanding of what our data looks like, we can begin identifying those spurious reads and low quality cells and then remove them. First, we'll plot the percent mito (perc.mt) against the cell count (nCount_RNA) to get an idea of what threshold we should set for nCount: + +```r +plot(x = srt$nCount_RNA, y = srt$perc.mt, main = "UMI Counts x Percent Mito", xlab = "UMI_count", ylab = "percent mito") +``` +![UMI x mito](../../images/scrna-SeuratRStudio/plot4.png "UMI counts x Percent mito.") +We are looking for cell counts with high mitochondrial percentages in their feature expression. + +>High Mitochondrial Reads +>High mito expression will typically indicate stressed out cells (often due to the extraction, sorting, or sample prep protocols). +> +{: .comment} + +These cells won't tell us much biologically, rather, they will contribute noise that we'll aim to filter out of our data. With that being said, there is a level of metabolic activity that is expected but will be specific to your samples/tissue/organism--so it is worth looking into what that might look like when it comes time to analyze your own data. + +We can also zoom in on the x-axis to get a better idea of what threshold to set by adjusting the xlim parameter: + +```r +plot(x = srt$nCount_RNA, y = srt$perc.mt, main = "UMI Counts x Percent Mito", xlab = "UMI_count", ylab = "percent mito", xlim = c(0,1750)) +``` +![UMI x mito zoomed in on X](../../images/scrna-SeuratRStudio/plot5.png "UMI counts x Percent mito-Zoomed in on X.") + +>Interpretations +>It looks like just before nCount_RNA = 1750, the perc.mito peaks above 2 percent--a conservative threshold. +> +{: .comment} + +Now we can take a closer look at the y-axis to decide on a mito threshold to set. Once more, we want to get rid of as few cells as possible while still removing those with unexpectedly high mito percentages. + +```r +plot(x = srt$nCount_RNA, y = srt$perc.mt, main = "UMI Counts x Percent Mito", xlab = "UMI_count", ylab = "percent mito", ylim = c(0,3)) +``` +![UMI x mito zoomed in Y](../../images/scrna-SeuratRStudio/plot6.png "UMI counts x Percent mito-Zoomed in on Y.") + +>Interpretations +>We can see a clear trend wherein cells that have around 3 percent mito counts or higher also have far fewer total counts. These cells are low quality, will muddy our data, and are likely stressed or ruptured prior to encapsulation in a droplet. +> +{: .comment} + +Take a look at what proportion of cells those thresholds will include and disclude from our dataset: + +```r +prop.table(table(srt@meta.data$nCount_RNA > 1750)) +prop.table(table(srt@meta.data$perc.mt > 3)) +``` + +If we are happy with those thresholds for cells and percent mito, we can look at the the gene count threshold next. + +>Otherwise +>If not, repeat the preceding steps to hone in on a threshold more suited to your needs. +> +{: .comment} + +To set a threshold for gene count, let's plot the gene counts (nFeature_RNA) against the percent mito (perc.mt): + +```r +plot(x = srt$nFeature_RNA, y = srt$perc.mt, main = "Gene Counts x Percent Mito", xlab = "gene_count", ylab = "percent mito") +``` +![Gene x mito](../../images/scrna-SeuratRStudio/plot7.png "Gene counts x Percent mito.") + +Once again, let's zoom in on the x-axis, but this time to get an idea of which nFeature_RNA threshold to set: + +```r +plot(x = srt$nFeature_RNA, y = srt$perc.mt, main = "Gene Counts x Percent Mito", xlab = "gene_count", ylab = "percent mito", xlim = c(0,1275)) +``` +![Gene x mito--zoomed in](../../images/scrna-SeuratRStudio/plot8.png "Gene counts x Percent mito zoomed in.") + +>Interpretations +>You can see how cells with nFeature_RNA up to around, perhaps 575 genes, often have high perc.mt. The same can be said for cells with nFeature_RNA above 1275. +> +>We could also use the violin plots to come up with these thresholds, and thus also take batch into account. It’s good to look at the violins as well, because you don’t want to accidentally cut out an entire sample (i.e. N703 and N707 which both have cell counts on the lower side). +> +{: .comment} + +Now let's take a look at what those nFeature_RNA thresholds will include and disclude from our data. + +```r +prop.table(table(srt@meta.data$nFeature_RNA > 1275 | srt@meta.data$nFeature_RNA < 575)) +``` + +# Applying our Thresholds +Once we are happy with our filtering thresholds, it’s time to apply them to our data! + +>Create a new object +> You will notice in the next line of code, we have indicated a new object name for this filtered (subset) data. This is good practice so that you don't have to start all over in case you decide to change your filtering parameters, which you likely will, or if something goes awry. +> +{: .tip} + +```r +subset_srt<-subset(srt, nCount_RNA > 1750 & nFeature_RNA > 1275 & perc.mt < 3 | nFeature_RNA < 600) +``` + +In this step we are also creating a new object (notice the new object name preceding the subset() function you just ran) so that we may compare back and forth between our unfiltered and filtered data set if we please. + +Next, we want to filter out genes that no longer show any expression in the remaining, high quality cells in our filtered dataset. In order to do so we will extract the filtered matrix from our filtered object. + +```r +subset_matrix<-GetAssayData(subset_srt) +``` + +Since you’ve removed a whole heap of cells, and the captured genes are sporadic (i.e. a small percentage of the overall transcriptome per cell) this means there are a number of genes in your matrix that are not expressed in any of the cells left in our filtered matrix. + +Genes that do not appear in any cell, or even in only 1 or 2 cells, may break some analytical tools and will generally not be biologically informative. So let’s remove them! + +We can use the filtered matrix we just extracted to create a new Seurat object, this time including the argument: min.cells = 3. This will remove any genes from our matrix that have less than 3 cells expressing them. + +>More Thresholds +> Note that 3 is not necessarily the best number, rather it is a fairly conservative threshold. You could go as high as 10 or more. +> +{: .tip} + +```r +filtered_srt <- CreateSeuratObject(counts = subset_matrix, meta.data = subset_srt@meta.data, min.cells = 3) +``` + +Now that we have filtered out both noisy "cells" and genes from our dataset, let's clean up our environment. Remove objects that we no longer need to ensure that we stay organized and RStudio has enough memory capacity to perform downstream analyses. This likely will not be an issue while doing this tutorial, but in practice it will help things run smoothly. + +```r +rm(subset_matrix, subset_srt) +``` + +# Processing +Currently, we still have quite big data. We have two issues here + 1. We already saw in our filtering plots that there are differences in how many transcripts and genes have been counted per cell. This technical variable could, and likely would, distract us from identifying true biological differences. + 2. We like to plot things on 2-dimensional X/Y plots. So, for instance, Gapdh could be on one axis, and Actin could be on another, and then each cell is plotted onto that 2D axis based on how many of each transcript they possess. + +Although that would be fine, adding in a 3rd dimension (or, indeed, in our case, a dimension for each of the thousands of genes), is a bit trickier. + +So, our next steps will be to transform our big data object into something that is easy to analyse and easy to visualize. + +We will run SCTransform, a combinatorial function by Seurat that normalizes the data, finds variable features, and then scales the data. In their initial workflow, and in the Scanpy version of this tutorial, these steps are run individually. However, with the second version of SCTransform comes time efficiency and optimization for downstream analyses. + +```r +filtered_srt<- SCTransform(filtered_srt, vars.to.regress = c("perc.mt", "nFeature_RNA", "nCount_RNA"), verbose = TRUE, return.only.var.genes = FALSE, seed.use = 1448145) +``` +>What is Normalization? +> Normalisation helps reduce the differences between gene and UMI counts by fitting total counts across cells in our data to be comparable to one another. SCTransform regularizes the gene expression profiles via a negative binomial regression while also controlling for overfitting of the data. This step can also be done using Seurat's NormalizeData() function, but would need to be followed by FindVariableFeatures() and ScaleData(). +> +{: .comment} + + +We also have loads of genes, but not all of them vary in expression from cell to cell. For instance, housekeeping genes are defined as not changing much from cell to cell, so we could remove these from our data to simplify our analyses. + +The find variable features step within SCTransform (or Seurat's FindVariableFeatures() function) will flag genes that do vary across cells to expedite future analyses and ensure that we, and Seurat, don't waste time looking for meaningful differences where they don't exist. + +Then, SCTransform (or Seurat's ScaleData() function) will scale the data so that all genes have the same variance and a zero mean. + +This is an important step to set up our data for further dimensionality reduction. It also helps negate sequencing depth differences between samples, since the gene levels across the cells become comparable. + +>Don't Worry! +> Note, that the differences from scaling etc. are not the values you have at the end - i.e. if your cell has average GAPDH levels, it will not appear as a ‘0’ when you calculate gene differences between clusters. +> +{: .comment} + +Although we've made our expression values comparable to one another and our overall dataset less computationally demanding, we still have way too many dimensions (n cells x n genes!). + +Transcript changes are not usually singular--which is to say, genes function and exist in pathways and groups. It would be easier to analyse our data if we could group these differences. To address this we will run principal component analysis (PCA). + +>What is PCA? +>Principal components (PCs) are calculated from highly dimensional data to find the most representative spread in the dataset. So in our highly variable gene dimensions, there will be one line (axis) that yields the most spread and variation across the cells. That will be our first principal component. +{: .comment} + +We can calculate the first handful of principal components in our data to drastically reduce the number of dimensions: + +>Running Computationally Demanding Steps on Variable Features +>You'll notice that the RunPCA() function is run using the variable features from the previous step. This signficantly decreases the number of genes, and their expression changes, that must be grouped into principal components by this step. +{: .tip} + +```r +filtered_srt <- RunPCA(filtered_srt, features = VariableFeatures(object = filtered_srt)) +``` + +To visualize how our principal components (PCs) represent our data, let's create an elbow plot: + +```r +ElbowPlot(filtered_srt, ndims = 50) +``` +![PC Elbow Plot](../../images/scrna-SeuratRStudio/plot9.png "Elbow Plot: Varianvce Explained x PC.") + +>Interpretations +>We can see that there is really not much variation explained past the 9th PC. So we might save ourselves a great deal of time and muddied data by focusing on the top 10 PCs. +{: .comment} + +You can also think about it like choosing a threshold of variance explained. Conservatively, 2.5 standard deviations are explained by about 10 of the PCs. + +We’re still looking at around 10 dimensions at this point--likely not the easiest to visualize. To make our lives easier, we need to identify how similar a cell is to another cell, across every cell across each of these dimensions. + +For this, we will use the k-nearest neighbor (kNN) graph, to identify which cells are close together and which are not. + +The kNN graph plots connections between cells if their distance (when plotted in this 10 dimensional space!) is amongst the k-th smallest distances from that cell to other cells. This will be crucial for identifying clusters, and is necessary for plotting a UMAP--which is what will ultimately allow us to visualize our data in 2 dimensions. + +>From UMAP developers: +>“Larger neighbor values will result in more global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50, with a choice of 10 to 15 being a sensible default”. +{: .comment} + +Let's now use the 10 PC threshold we chose from the Elbowplot and apply it to find neighbors: + +```r +filtered_srt <- FindNeighbors(filtered_srt, dims = 1:10) +``` + +Now we can use the neighborhood graph to identify clusters of cells whose transcriptional profiles appear most similar to one another. + +```r +filtered_srt <- FindClusters(filtered_srt, resolution = 0.5) +``` + +Unfortunately, identifying clusters is not as majestic as biologists often think - the math doesn’t necessarily identify true cell clusters. Every algorithm for identifying cell clusters falls short of a biologist knowing their data, knowing what cells should be there, and proving it in the lab. + +So, we’re going to make the best of it as a starting point and see what happens! We will define clusters from the kNN graph, based on how many connections cells have with one another. Roughly, this will depend on a resolution parameter for how granular you want to be. + +>On Clustering Resolution +>The resolution parameter available in the FindClusters() function allows for you, the bioinformatician, to dictate the granularity of the clusters. +> +>For example, a higher clustering resolution dictates increased granularity, and more stringent clusters. That is--cells must more closely resemble one another in order to be grouped into the same cluster than at a lower clustering resolution. +> +>In general, I find it easiest to think of a higher resolution producing more clusters and conversely, a lower resolution will produce less clusters. This parameter is a useful one that you will use often to help decipher how many true populations of cells are present in your data! +{: .tip} + +Now that we have made note within our object of which cells cluster together, we can start to visualize our data! Two major visualizations for this data type are tSNE and UMAP. We can calculate the coordinates for both prior to visualization. For tSNE, the parameter perplexity can be changed to best represent the data, while for UMAP the main change would be to change the kNN graph above itself, via the FindNeighbors() function. + +>On UMAP +>UMAP is the most recently developed, and most widely used dimensionality reduction for visualization of principal component data. It has been optimized since tSNE to better preserve global structure and is less computationally demanding. +> +{: .tip} + +```r +filtered_srt <- RunUMAP(filtered_srt, dims = 1:10, seed.use = 1323) +``` + +# Let's Take a Look +Now that we have run dimensionality reduction on our dataset, it is ready for visualization. Let's take a look at what our cells look like in a UMAP projection: + +```r +DimPlot(filtered_srt, reduction = "umap", label = TRUE, label.box = TRUE)+ NoLegend() +``` +![DimPlot colored by 0.5 resolution cluster](../../images/scrna-SeuratRStudio/plot10.png "DimPlot colored by 0.5 resolution cluster.") +Good work! It looks like with a clustering resolution of 0.5, we are able to identify 7 clusters of cells in our data. + +We can also look for expression of particular genes and see how those map to our UMAP projection. This is often useful in getting an initial understanding of which clusters might be representative of which cell types. + +```r +FeaturePlot(filtered_srt, features = "Gapdh", order = TRUE) +``` +![FeaturePlot: Gapdh](../../images/scrna-SeuratRStudio/plot11.png "FeaturePlot: Gapdh") + +We just plotted a housekeeping gene, Gapdh, so the broad expression we observe is expected. + +>Weird scale? +> If the scale of your data looks weird, it may be due to the DefaultAssay of your object. When we ran SCTransform, the function creates an entirely new assay within our Seurat object that includes scaled and normalized count values. This SCT Assay is what we want to visualize our expression values off of. So, if your scale is super broad, or goes negative, try running the following command before attempting to plot again: +>```r +>DefaultAssay(filtered_srt)<-"SCT" +>``` +{: .tip} + +In practice, it is helpful to plot known markers of cell types you expect to be in your dataset. This will give you a first look at how your cells are clustered. + +For example, we can plot early T-cell marker Il2ra and get an idea of which cells and/or clusters might resemble the early T-cells: + +```r +FeaturePlot(filtered_srt, features = "Il2ra", order = TRUE) +``` +![FeaturePlot: Aif1](../../images/scrna-SeuratRStudio/plot12.png "FeaturePlot: Il2ra") + +It is a good idea, when analyzing your own data, to plot some markers of cell types you expect to be present. Later on we can also use these FeaturePlots to visualize manual annotation of clusters. + +# Differential Expression Testing: Finding Markers +Because each cluster of cells was grouped based on similar transcriptome profiles, each cluster will inherently differ from one another based on a set of "marker" genes. + +Following an initial look at the DimPlots and FeaturePlots, we can take an even closer look at which genes are driving the clustering. + +In order to do so we can run cluster level differential expression tests. First, we will need to set our object's active identity to be the clusters. This will ensure that when Seurat's differential expression function is run, the groupings of cells across which it will compare are the clusters. + +>What are Identities? +> Identities are, at their core, categorical metadata values. They are columns of cell-level metadata that somehow group the cells together. Examples of identities could be cluster number, cell type if/once known, genotype, etc. +> +{: .tip} + +```r +Idents(filtered_srt)<- filtered_srt$seurat_clusters +``` + +>Syntax Lesson +> There are often many different ways to get the same job done in R, but especially when manipulating Seurat objects. We could alternatively set the active identity of our object with the following line of code too: +> +>```r +>filtered_srt<-SetIdent(object = filtered_srt, value = "seurat_clusters") +>``` +{: .tip} + +Then, we'll run Seurat's FindAllMarkers function, which will compare each identity (in this case cluster) against every other identity within its class (all the other clusters). This function of marker finding is particularly useful in identifying up, or down, regulated genes that drive differences in identity/cluster. + +```r +cluster_markers<-FindAllMarkers(object = filtered_srt) +View(cluster_markers) +``` + +We'll use these marker lists later on to label our cell types. + +We can also see which genes are differentially expressed across other variables in our metadata. For example, you can see which genes are up or down regulated across the different genotypes present in our dataset. To do so, let's first get a list of all the identity classes in our data. This information is kept in the metadata column, and any categorical variable will do. Here, let's pick genotype. + +```r +metadata<-as.data.frame(filtered_srt@meta.data) +filtered_srt<-SetIdent(object = filtered_srt, value = "Genotype") +``` + +The "metadata" object now in your environment is a dataframe with column names representing the different identities you may choose to group your cells by when running differential expression. The second line of code above will set the object's identity class to be the genotype from which the cell came from. + +Now, let's see what genes differentiate our wildtype from our mutant cells. First, we can identify how many different genotypes are in our data: + +```r +unique(filtered_srt$Genotype) +``` + +This output helpfully shows us what the genotypes are, and how they are labelled in our metadata. The small details, like capitalization, are important for referencing metadata information--our references must perfectly match the labelling in the object, otherwise they will not be recognized by the functions. + +Now that we know how our wildtype and mutant cells are labelled, we can use that information to directly compare the two. This time we will use a pairwise comparison method by using Seurat's FindMarkers() function (not to be confused with FindAllMarkers which has a comprehensive comparison approach): + +```r +markers<-FindMarkers(object = filtered_srt, ident.1 = "wild type genotype", ident.2 = "Igf2-p0 heterozygous knockout", test.use = "wilcox") +``` + +The above function will find all of the differentially expressed genes between ident.1 (wildtype) and ident.2 (mutant) using the Wilcoxon test. The resulting output will show genes with positive fold changes (denoting a higher expression in the first identity--wildtype) and negative fold changes (denoting a higher expression value in the second identity--mutant). + +>On Finding Markers +> This same test of differential expression can be run using any identity class and any two identities within the same class. As this is a more fine tuned comparison than FindAllMarkers, it can be useful to uncover differences across specific samples. +{: .comment} + +# Biological Interpretations +Now it’s the fun bit! We can see where genes are expressed, and start considering and interpreting the biology of it. At this point, it’s really about what information you want to get from your data--the following is only the tip of the iceberg. However, a brief exploration is good, because it may help give you ideas going forward for your own data. Let's start interrogating our data! + +Let's take another look at what our clusters look like: + +```r +DimPlot(object = filtered_srt, reduction = "umap", label = TRUE, label.box = TRUE, group.by = "seurat_clusters") + NoLegend() +``` +![DimPlot colored by 0.5 resolution cluster](../../images/scrna-SeuratRStudio/plot10.png "DimPlot colored by 0.5 resolution cluster.") + +>On Cluster Numbering +>Note that Seurat's cluster numbering is based on size alone, so clusters 0 and 1 are not necessarily related, they are just the clusters containing the most cells. +{: .comment} + +It would be nice to know what these cells are. This analysis (googling all of the marker genes, both checking where the ones you know are and then going through marker tables we generated) is a fun task for any individual experiment, so we’re going to speed past that and nab the assessment from the original paper! + +| Clusters | Markers | Cell Type | +|----------|-------------------------|-------------------------------------| +| 3 | Il2ra | Double negative (early T-cell) | +| 1,2,5 | Cd8b1, Cd8a, Cd4 | Double positive (middle T-cell) | +| 0 | Cd8b1, Cd8a, Cd4 - high | Double positive (late middle T-cell)| +| 4 | Itm2a | Mature T-cell | + +Feel free to plot these markers onto our dataset to see where they fall. This is generally a useful method of discerning cell types and can be useful for initial annotations. + +>Plotting Markers +>To do so, simply use the same FeaturePlot() function we used above, but replace the feature parameter with your new marker of interest. +>```r +>FeaturePlot(object = filtered_srt, features = c("Il2ra", "Cd8b1", "Cd8a", "Cd4", "Itm2a"), order = T, ncol = 3) +>``` +>![FeaturePlots of cell type markers](../../images/scrna-SeuratRStudio/plot21.png "FeaturePlots of our known cell type markers") +{: .tip} + +We can then manually label the clusters in whatever way we please. [Dplyr](https://dplyr.tidyverse.org/reference/mutate.html)'s mutate() function allows us to incorporate conditional metadata. That is to say, we can ask the function to label cells based on the cluster in which they have been assigned: + +```r +filtered_srt@meta.data<- mutate(filtered_srt@meta.data, celltype = case_when( + seurat_clusters %in% c(3) ~ "Double negative (early T-cell)", + seurat_clusters %in% c(1,2,5) ~ " Double positive (middle T-cell)", + seurat_clusters %in% c(0) ~ "Double positive (late middle T-cell)", + seurat_clusters %in% c(4) ~ "Mature T-cell" +)) +``` +Once we have labelled our clusters, we can visualize what our cell types actually look like: + +```r +DimPlot(object = filtered_srt, reduction = "umap", group.by = "celltype") +``` +![DimPlot colored by labelled celltype](../../images/scrna-SeuratRStudio/plot13.png "DimPlot colored by assigned cell type") + +Now we can begin to feel a bit more oriented in exploring our data. The clusters are labelled with cell types, and our object has been processed enough such that we may now begin to answer some real biological questions! Now that we know what we’re dealing with, let’s examine the effect of our variable, real science! + +## Keep Digging +Are there any differences in genotype? Or in biological terms, is there an impact of growth restriction on T-cell development in the thymus? We can begin to answer this question visually by using the "split.by" parameter in Seurat's plot functions. + +```r +DimPlot(object = filtered_srt, reduction = "umap", group.by = "celltype", split.by = "Genotype") +``` +![DimPlot colored by labelled celltype split by genotype](../../images/scrna-SeuratRStudio/plot14.png "DimPlot colored by assigned cell typesplit by genotype") + +We can see that there seems to be a decrease in cellcounts across the celltypes in the het mutant... INTERESTING! What next? We might look further at the transcripts present in both those populations, and perhaps also look at the genotype marker table… So much to investigate! But before we set you off to explore to your heart’s delight, let’s also look at this a bit more technically. + +# Technical Assessment +Is our analysis real? Is it right? Well, we can assess that a little bit. + +First thing's first, is there a batch effect? + +```r +DimPlot(object = filtered_srt, reduction = "umap", group.by = "Individual") +``` +![DimPlot colored by labelled celltype split by individual/batch](../../images/scrna-SeuratRStudio/plot15.png "DimPlot colored by assigned cell types split by individual/batch") + +While some differences across batch are expected and nothing to be concerned about, the immature T-cells looks to be mainly comprised of Individual 3. There might be a bit of batch effect, so you could consider using batch correction on this dataset. However, if we focus our attention on the other cluster - mature T-cells - where there is batch mixing, we can still assess this biologically even without batch correction. + +Additionally, we will also look at the confounding effect of sex: + +```r +DimPlot(object = filtered_srt, reduction = "umap", group.by = c("Sex", "Individual", "Genotype")) +``` +![DimPlot colored by Sex, Individual, and Genotype](../../images/scrna-SeuratRStudio/plot16.png "DimPlot colored by Sex, Individual, and Genotype") + + +We note that the one female sample - unfortunately one of merely three knockout samples - seems to be distributed in the same areas as the knockout samples at large, so luckily, this doesn’t seem to be a confounding factor and we can still learn from our data. Ideally, this experiment would be re-run with either more female samples all around or swapping out this female from the male sample. + +Are there any clusters or differences being driven by sequencing depth, a technical and random factor? + +```r +FeaturePlot(object = filtered_srt, reduction = "umap", features = "nCount_SCT") +``` +![FeaturePlot colored by counts](../../images/scrna-SeuratRStudio/plot17.png "FeaturePlot colored by counts") + + +There doesn't visually appear to be any differences in sequencing depth across the clusters, but let's check out some of those other variables we grouped by: + +```r +FeaturePlot(object = filtered_srt, reduction = "umap", features = "nCount_SCT", split.by = "Individual") +``` +![FeaturePlot colored by counts](../../images/scrna-SeuratRStudio/plot18.png "FeaturePlot colored by counts split by Individual") + +There we go! This might explain the dramatic shift in early to middle T-Cell between wildtype and knockout cells--the leftmost early to middle T-cells simply have a higher sequencing depth represented by Individual 3 (UMIs/cell) than the ones on the right side. Well, that explains some of the sub-cluster that we’re seeing in that splurge (specifically this likely accounts for the discernment between clusters 1, 2, and 5). + +Luckily, and importantly, we don’t see the double negative or mature T-cells being similarly affected. So, although, this variable of sequencing depth, or moreso, Individual, might be something to regress out somehow, it doesn’t seem to be impacting our dataset such that we cannot draw meaningful insights. + + +>Overprocessing +>The less you can regress/modify your data, in general, the better--you want to stay as true as you can to the raw data, and only use maths to correct your data when you really need to (and not to create insights where there are none!). +{: .tip} + + +Do you think we processed these samples well enough? We have seen in the previous images that these clusters are not very tight or distinct, so we could consider stronger filtering. Let's take a look at gene expression of a gene we know should not be expressed in tCells as a sanity check: + +```r +FeaturePlot(object = filtered_srt, reduction = "umap", features = "Hba-a1") +``` +![FeaturePlot of Hemoglobin](../../images/scrna-SeuratRStudio/plot19.png "FeaturePlot of Hemoglobin") + + +Hemoglobin--a red blood cell marker that should NOT be found in T-cells--appears throughout the entire dataset in low numbers and as a likely marker of Cluster 6. This suggests that some background noise may have been introduced by the media the cells were in. We might consider in the wet lab trying to get a purer, happier sample, with less background or in the dry lab, we can take advantage of techniques such as SoupX or others to remove this technical noise. + +>Removing Noise +>Adjusting the filtering settings (increasing minimum counts/cell, etc.) is often the place to start in these scenarios. +{: .tip} + +Do you think the clustering is appropriate? i.e. are there single clusters that you think should be separate, and multiple clusters that could be combined? + +```r +CellType_DimPlot<-DimPlot(object = filtered_srt, reduction = "umap", group.by = "celltype") +Cd4_FeaturePlot<-FeaturePlot(object = filtered_srt, reduction = "umap", features = "Cd4") +CellType_DimPlot | Cd4_FeaturePlot +``` +![Double Positive differentiation?](../../images/scrna-SeuratRStudio/plot20.png "Double Positive differentiation?") + +Important to note, lest all bioinformaticians combine forces to attack the biologists: just because a cluster doesn’t look like a cluster by eye is NOT enough to say it’s not a cluster! But looking at the biology here, we struggled to find marker genes to distinguish the double positive populations, which we know are also affected by depth of sequencing. That’s a reasonable argument that Clusters 1, 2, and 5 might not be all that different. Maybe we need more depth of sequencing across all those cells, or to compare these explicitly to each other (consider variations on FindMarkers!). + +However, the late double positive cluster is both seemingly leaving the larger body of clusters and also has fewer knockout cells, so we might go and look at what those cells are expressing in the marker genes. If we look at the mature T-cells further, we can see that their marker gene--Itm2a--is only expressed in half of the cluster. You might consider sub-clustering this to investigate further, either through changing the resolution or through analysing this cluster alone. + +If we look at the differences between genotypes alone (so the pseudo-bulk), we can see that many, if not most, of the genes in that list are actually ribosomal. This could be a housekeeping background, it might be cell cycle related, it may be biological, or some combination of all three. You might consider investigating the cycling status of the cells, or even regressing this out (which is what the authors did). + +Ultimately, there are quite a lot ways to analyse your single-cell data, both within the confines of this tutorial (the many parameters that could be changed throughout) and outside of it (batch correction, sub-clustering, cell-cycle scoring, inferred trajectories, etc.) Most analyses will still yield the same general output, though: there are fewer knockout cells in the mature T-cell population, suggesting some sort of abberant development of T-cells in the Igf2-p0 hets. + +Finally, we can export plots and objects from RStudio back into Galaxy. To do so, we'll use the gx_put() function provided to us by Galaxy. Let's save our Seurat object and the cell type labelled DimPlot! + +```r +gx_put(filtered_srt) +gx_put(CellType_DimPlot) +``` + +The above functions will export your object and the plot into your Galaxy history! + +Congratulations! You have interpreted your plots in several important ways! diff --git a/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/workflows/index.md b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/workflows/index.md new file mode 100644 index 00000000000000..e092e0ae66ddd4 --- /dev/null +++ b/topics/single-cell/tutorials/scrna-case_FilterPlotandExploreRStudio/workflows/index.md @@ -0,0 +1,3 @@ +--- +layout: workflow-list +---