init: data analysis

brickmanlab · Nov 17, 2022 · 76c7f01 · 76c7f01
commit 76c7f01
Show file tree

Hide file tree

Showing 32 changed files with 22,005 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,8 @@
+.DS_Store
+data/
+GEO/
+notebooks/.Rhistory
+reports/figures/
+*.smbdelete*
+*temp*
+*.pdf
diff --git a/CAT/vitro-vs-vivo/vitro-vs-vivo.svg b/CAT/vitro-vs-vivo/vitro-vs-vivo.svg
diff --git a/CAT/vitro-vs-vivo/vitro_vitro_euclidean.xlsx b/CAT/vitro-vs-vivo/vitro_vitro_euclidean.xlsx
diff --git a/CAT/vitro-vs-vivo/vitro_vivo_euclidean.xlsx b/CAT/vitro-vs-vivo/vitro_vivo_euclidean.xlsx
diff --git a/CAT/vitro-vs-vivo/vivo_vitro_euclidean.xlsx b/CAT/vitro-vs-vivo/vivo_vitro_euclidean.xlsx
diff --git a/CAT/vitro-vs-vivo/vivo_vivo_euclidean.xlsx b/CAT/vitro-vs-vivo/vivo_vivo_euclidean.xlsx
diff --git a/CAT/vitrosub-vs-vivo/vitrosub-vs-vivo.svg b/CAT/vitrosub-vs-vivo/vitrosub-vs-vivo.svg
diff --git a/CAT/vitrosub-vs-vivo/vitrosub_vitrosub_euclidean.xlsx b/CAT/vitrosub-vs-vivo/vitrosub_vitrosub_euclidean.xlsx
diff --git a/CAT/vitrosub-vs-vivo/vitrosub_vivo_euclidean.xlsx b/CAT/vitrosub-vs-vivo/vitrosub_vivo_euclidean.xlsx
diff --git a/CAT/vitrosub-vs-vivo/vivo_vitrosub_euclidean.xlsx b/CAT/vitrosub-vs-vivo/vivo_vitrosub_euclidean.xlsx
diff --git a/CAT/vitrosub-vs-vivo/vivo_vivo_euclidean.xlsx b/CAT/vitrosub-vs-vivo/vivo_vivo_euclidean.xlsx
diff --git a/README.md b/README.md
@@ -0,0 +1,13 @@
+# Wong et al 2021
+
+This is a MARS-seq experiment on samples:
+
+* VFG53_-BMP+BMP_1109
+* VFG83_-BMP+BMP_1109
+* VFG53_1025
+* VFG83_1025
+* VFG53_-BMP_1101
+* VFG83_-BMP_1101
+* VFG53_-BMP+FGF_1109
+* VFG83_-BMP+FGF_1109
+* ADE_111_1213
diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep
diff --git a/notebooks/00_preparation.Rmd b/notebooks/00_preparation.Rmd
@@ -0,0 +1,72 @@
+---
+title: "00 - preparation"
+author: "Martin Proks"
+date: '`r Sys.Date()`'
+knit: (function(inputFile, encoding) { 
+      rmarkdown::render(inputFile,
+                        encoding=encoding,
+                        output_format='all',
+                        output_dir='../reports/')})
+output:
+  # To create PDF report, uncomment below
+  # pdf_document:
+  #   toc: yes
+  html_document:
+    number_sections: yes
+    theme: yeti
+    toc: yes
+    toc_float: yes
+    df_print: paged
+    dev: png
+---
+
+```{r knitr, include = FALSE}
+DOCNAME = knitr::current_input()
+knitr::opts_chunk$set(autodep        = TRUE,
+                      cache          = FALSE,
+                      cache.path     = paste0("cache/", DOCNAME, "/"),
+                      cache.comments = TRUE,
+                      echo           = TRUE,
+                      error          = FALSE,
+                      fig.align      = "center",
+                      fig.path       = paste0("../reports/figures/", DOCNAME, "/"),
+                      fig.width      = 10,
+                      fig.height     = 8,
+                      message        = FALSE,
+                      warning        = FALSE)
+```
+
+# Find count matrix
+
+```{r}
+files <- stringr::str_sort(list.files(path = '../GEO/', pattern = "^AB", full.names = T), numeric = T)
+```
+
+# Concatenate counts
+
+```{r}
+counts <- data.frame(read.table(files[1], sep = "\t"))
+for (i in 2:length(files)) {
+  counts <- cbind(counts, read.table(files[i], sep = "\t"))
+}
+
+rownames(counts) <- unlist(lapply(rownames(counts), FUN = function(x) { return(strsplit(x, ';')[[1]][1]) }))
+```
+
+# Save counts
+
+```{r}
+data <- NULL
+data$counts <- counts
+data$metadata <- as.data.frame(readxl::read_excel("../data/corrected_fung_design_file.xlsx", sheet = "Sheet1", col_names = T)[-1])
+data$annot <- read.table("../data/gene-annotations.tsv", sep = "\t", header = T)
+rownames(data$metadata) <- data$metadata[,1]
+
+saveRDS(data, file = "../data/processed/MARS-Fung.RDS")
+```
+
+# Session info
+
+```{r}
+sessionInfo()
+```
diff --git a/notebooks/01_analysis.Rmd b/notebooks/01_analysis.Rmd
@@ -0,0 +1,252 @@
+---
+title: "Single Cell Analysis - human sample"
+author: "Martin Proks"
+date: '`r Sys.Date()`'
+knit: (function(inputFile, encoding) { 
+      rmarkdown::render(inputFile,
+                        encoding=encoding,
+                        output_format='all',
+                        output_dir='../reports/')})
+output:
+  # To create PDF report, uncomment below
+  pdf_document:
+    toc: yes
+  html_document:
+    number_sections: yes
+    theme: yeti
+    toc: yes
+    toc_float: yes
+    df_print: paged
+    dev: png
+---
+
+```{r knitr, include = FALSE}
+DOCNAME = knitr::current_input()
+knitr::opts_chunk$set(autodep        = TRUE,
+                      cache          = FALSE,
+                      cache.path     = paste0("cache/", DOCNAME, "/"),
+                      cache.comments = TRUE,
+                      echo           = TRUE,
+                      error          = FALSE,
+                      fig.align      = "center",
+                      fig.path       = paste0("../reports/figures/", DOCNAME, "/"),
+                      fig.width      = 10,
+                      fig.height     = 8,
+                      message        = FALSE,
+                      warning        = FALSE)
+```
+
+# Introduction
+
+Single-cell analysis of human dataset.
+
+# Load dataset
+
+```{r message=FALSE}
+library(dplyr)
+library(Seurat)
+library(ggplot2)
+library(patchwork)
+random_seed <- 12345
+
+data <- readRDS('../data/raw/MARS-Fung.RDS')
+raw_ann <- CreateSeuratObject(data$counts, meta.data = data$metadata)
+```
+
+## Quality plots
+
+```{r}
+raw_ann[['percent.mito']] <- PercentageFeatureSet(raw_ann, pattern = "^MT-")
+raw_ann[['percent.ercc']] <- PercentageFeatureSet(raw_ann, pattern = "^ERCC-")
+raw_ann[['percent.ribo']] <- PercentageFeatureSet(raw_ann, pattern = "^RP[LS]")
+```
+
+```{r}
+VlnPlot(raw_ann, 
+        features = c("nFeature_RNA", "nCount_RNA", "percent.mito", "percent.ribo"),
+        ncol = 4)
+```
+
+```{r}
+FeatureScatter(raw_ann, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "Stage")
+```
+
+## Filtering
+
+```{r}
+print(paste0("Before filtering: ", dim(raw_ann)[2], " cells ",  dim(raw_ann)[1], " genes"))
+```
+
+```{r}
+# Remove ERCC
+raw_ann <- raw_ann[rownames(raw_ann)[!grepl('ERCC-', rownames(raw_ann))], ]
+```
+
+```{r}
+# Remove Zero stage
+raw_ann <- raw_ann[, [email protected]$Stage != 'Zero']
+```
+
+```{r}
+nbins <- 100
+min_cells <- 2000
+max_cells <- 35e3
+min_genes <- 550
+max_genes <- 4950
+
+ggplot([email protected], aes(x=nCount_RNA)) + 
+  geom_histogram(bins = nbins) + 
+  geom_vline(aes(xintercept=min_cells), linetype="dashed", color='red') +
+  geom_vline(aes(xintercept=max_cells), linetype="dashed", color='red')
+
+ggplot([email protected], aes(x=nFeature_RNA)) + 
+  geom_histogram(bins = nbins) +
+  geom_vline(aes(xintercept=min_genes), linetype="dashed", color='red') +
+  geom_vline(aes(xintercept=max_genes), linetype="dashed", color='red')
+
+ggplot([email protected], aes(x=nCount_RNA, y=nFeature_RNA, color=percent.mito)) + 
+  geom_point() + 
+  scale_color_continuous(type = "viridis") +
+  geom_vline(aes(xintercept=min_cells), linetype="dashed", color='red') +
+  geom_vline(aes(xintercept=max_cells), linetype="dashed", color='red') +
+  geom_hline(aes(yintercept=min_genes), linetype="dashed", color='red') +
+  geom_hline(aes(yintercept=max_genes), linetype="dashed", color='red')
+```
+
+```{r}
+adata <- subset(raw_ann, subset = 
+                    nFeature_RNA > min_genes & nFeature_RNA < max_genes & 
+                    nCount_RNA > min_cells & nCount_RNA < max_cells & 
+                    percent.mito < 20)
+```
+
+```{r}
+adata <- CreateSeuratObject(adata@assays$RNA@counts, min.cells = 3, meta.data = [email protected])
+```
+
+## After filtering
+
+```{r}
+print(paste0("After filtering: ", dim(adata)[2], " cells ",  dim(adata)[1], " genes"))
+```
+
+```{r}
+VlnPlot(adata, 
+        features = c("nFeature_RNA", "nCount_RNA", "percent.mito", "percent.ribo"),
+        ncol = 4)
+```
+
+## Save dataset
+
+```{r}
+saveRDS(adata, file = "../data/processed/01_fung.filtered.RDS")
+```
+
+# Subset
+
+```{r}
+adata <- adata[, adata$Condition %in% c("VFG53_1025", "VFG53_-BMP_1101", "VFG53_-BMP+FGF_1109")]
+dim(adata)
+```
+
+# Normalization
+
+```{r message=FALSE, warning=FALSE}
+adata <- NormalizeData(adata)
+```
+
+# HVG
+
+```{r}
+
+adata <- FindVariableFeatures(adata, selection.method = "vst", nfeatures = 2000)
+top10 <- head(VariableFeatures(adata), 10)
+
+# plot variable features with and without labels
+plot1 <- VariableFeaturePlot(adata)
+plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
+plot1 + plot2
+```
+
+# Scale data
+
+```{r, message=FALSE}
+adata <- ScaleData(adata)
+```
+
+# Cell Cycle
+
+```{r}
+adata <- CellCycleScoring(adata, 
+                          s.features = cc.genes.updated.2019$s.genes, 
+                          g2m.features = cc.genes.updated.2019$g2m.genes, 
+                          set.ident = TRUE)
+```
+
+# PCA
+
+```{r}
+adata <- RunPCA(adata, npcs = 50)
+ElbowPlot(adata)
+DimPlot(adata, reduction = 'pca', group.by = 'Stage')
+DimPlot(adata, reduction = 'pca', group.by = 'Phase')
+```
+
+```{r}
+Idents(adata) <- 'Stage'
+commonR::plot_proportion_bar(adata, group.by = "Phase") + ggtitle("Cell cycle proportion per Stage")
+```
+
+# Clustering & UMAP (resolution 0.5)
+
+```{r, message=FALSE}
+adata <- FindNeighbors(adata, dims = 1:20)
+adata <- FindClusters(adata, random.seed = random_seed, resolution = 0.5)
+adata <- RunUMAP(adata, dims = 1:20, seed.use = random_seed)
+```
+
+```{r}
+DimPlot(adata, reduction = "umap")
+DimPlot(adata, reduction = "umap", group.by = 'Stage')
+DimPlot(adata, reduction = "umap", group.by = 'Phase')
+```
+
+```{r}
+markers <- FindAllMarkers(adata, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.5)
+markers %>%
+    group_by(cluster) %>%
+    slice_max(n = 10, order_by = avg_log2FC)
+```
+
+# Clustering & UMAP (resolution 0.6)
+
+```{r, message=FALSE}
+adata <- FindNeighbors(adata, dims = 1:20)
+adata <- FindClusters(adata, random.seed = random_seed, resolution = 0.6)
+adata <- RunUMAP(adata, dims = 1:20, seed.use = random_seed)
+```
+
+```{r}
+DimPlot(adata, reduction = "umap")
+DimPlot(adata, reduction = "umap", group.by = 'Stage')
+DimPlot(adata, reduction = "umap", group.by = 'Phase')
+```
+
+```{r}
+markers <- FindAllMarkers(adata, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.5)
+markers %>%
+    group_by(cluster) %>%
+    slice_max(n = 10, order_by = avg_log2FC)
+```
+
+# Save analysis
+
+```{r}
+saveRDS(adata, file = '../data/processed/01_fung.RDS')
+```
+
+# Session info
+
+```{r}
+sessionInfo()
+```