Load Data

Load single cell dataset

library(tidyverse)

Compute sum of UMIs

Cerebellum ce ll types

cerebellum <- read_tsv("../../Data/Lake2018/GSE97930_CerebellarHem_snDrop-seq_UMI_Count_Matrix_08-01-2017.txt")
annot <- tibble(cell=colnames(cerebellum)[-1],cell_type=gsub("_cbm[0-9]_[ATGCN]+$","",colnames(cerebellum)[-1]))

cerebellum <- gather(cerebellum,cell,UMI,Gran_cbm1_TTAATCAGTCGC:Mic_cbm9_ATGGTCGGAGAC,-Gene)
cerebellum <- inner_join(cerebellum,annot)

cerebellum <- cerebellum %>% group_by(Gene,cell_type) %>% summarise(sumUMI=sum(UMI))
write_tsv(cerebellum,path = "../Data/Lake2018/GSE97930_CerebellarHem_snDrop-seq_UMI_Count_Matrix_08-01-2017_sumUMI.txt")

Frontal cortex cell types

fc <- read_tsv("../../Data/Lake2018/GSE97930_FrontalCortex_snDrop-seq_UMI_Count_Matrix_08-01-2017.txt")

fc <- gather(fc,cell,UMI,Ex1_fcx8_GGACGCCTTTAA:Mic_fcx13_GGGAGTAACTTG,-Gene)
cell_type1M <- gsub("_fcx[0-9]{1,2}_[ATGCN]+$","",fc$cell[1:100000000])
cell_type2M <- gsub("_fcx[0-9]{1,2}_[ATGCN]+$","",fc$cell[100000001:200000000])
cell_type3M <- gsub("_fcx[0-9]{1,2}_[ATGCN]+$","",fc$cell[200000001:353993295])
cell_types <- c(cell_type1M,cell_type2M,cell_type3M)
fc$cell_type <- cell_types

fc <- fc %>% group_by(Gene,cell_type) %>% summarise(sumUMI=sum(UMI))
write_tsv(fc,path = "../../Data/Lake2018/GSE97930_FrontalCortex_snDrop-seq_UMI_Count_Matrix_08-01-2017_sumUMI.txt")

Visaul cortex cell types

vc <- read_tsv("../../Data/Lake2018/GSE97930_VisualCortex_snDrop-seq_UMI_Count_Matrix_08-01-2017.txt")

vc <- gather(vc,cell,UMI,Ex1_occ17_AAGTGAGTGACC:Mic_occ24_ATTAATTGAAGA,-Gene)
cell_type1M <- gsub("_occ[0-9]{1,2}_[ATGCN]+$","",vc$cell[1:100000000])
cell_type2M <- gsub("_occ[0-9]{1,2}_[ATGCN]+$","",vc$cell[100000001:200000000])
cell_type3M <- gsub("_occ[0-9]{1,2}_[ATGCN]+$","",vc$cell[200000001:300000000])
cell_type4M <- gsub("_occ[0-9]{1,2}_[ATGCN]+$","",vc$cell[300000001:400000000])
cell_type5M <- gsub("_occ[0-9]{1,2}_[ATGCN]+$","",vc$cell[400000001:500000000])
cell_type6M <- gsub("_occ[0-9]{1,2}_[ATGCN]+$","",vc$cell[500000001:600000000])
cell_type7M <- gsub("_occ[0-9]{1,2}_[ATGCN]+$","",vc$cell[600000001:700000000])
cell_type8M <- gsub("_occ[0-9]{1,2}_[ATGCN]+$","",vc$cell[700000001:804566088])


cell_types <- c(cell_type1M,cell_type2M,cell_type3M,cell_type4M,cell_type5M,cell_type6M,cell_type7M,cell_type8M)
vc$cell_type <- cell_types

vc <- vc %>% group_by(Gene,cell_type) %>% summarise(sumUMI=sum(UMI))
write_tsv(vc,path = "../../Data/Lake2018/GSE97930_VisualCortex_snDrop-seq_UMI_Count_Matrix_08-01-2017_sumUMI.txt")

Load computed UMI for each tissue

cerebellum <- read_tsv("../../Data/Lake2018/GSE97930_CerebellarHem_snDrop-seq_UMI_Count_Matrix_08-01-2017_sumUMI.txt")

## Parsed with column specification:
## cols(
##   Gene = col_character(),
##   cell_type = col_character(),
##   sumUMI = col_double()
## )

cerebellum <- mutate(cerebellum,tissue="cerebellum")

fc <- read_tsv("../../Data/Lake2018/GSE97930_FrontalCortex_snDrop-seq_UMI_Count_Matrix_08-01-2017_sumUMI.txt")

## Parsed with column specification:
## cols(
##   Gene = col_character(),
##   cell_type = col_character(),
##   sumUMI = col_double()
## )

fc <- mutate(fc,tissue="Frontal Cortex")

vc <- read_tsv("../../Data/Lake2018/GSE97930_VisualCortex_snDrop-seq_UMI_Count_Matrix_08-01-2017_sumUMI.txt")

## Parsed with column specification:
## cols(
##   Gene = col_character(),
##   cell_type = col_character(),
##   sumUMI = col_double()
## )

vc <- mutate(vc,tissue="Visual Cortex")

Join the cell types from the three tissues

exp <- rbind(cerebellum,fc,vc)

Get sum of UMI for each gene and each cell type with the same name

exp_lvl5 <- exp %>% group_by(Gene,cell_type) %>% summarise(sumUMI=sum(sumUMI)) %>% ungroup()

Tidy the data

exp_lvl5 <- exp_lvl5 %>% spread(cell_type,sumUMI)
exp_lvl5[is.na(exp_lvl5)] <- 0
exp_lvl5 <- gather(exp_lvl5,cell_type,sumUMI,Ast:Purk2)

Load gene coordinates

Load gene coordinates and extend upstream and downstream by 100kb.

File downloaded from MAGMA website (https://ctg.cncr.nl/software/magma).

Filtered to remove extended MHC (chr6, 25Mb to 34Mb).

gene_coordinates <- read.table("../../Data/NCBI/NCBI37.3.gene.loc.extendedMHCexcluded",header=F,stringsAsFactors = F) %>% 
  mutate(start=ifelse(V3-100000<0,0,V3-100000),end=V4+100000,V1=as.character(V1)) %>% 
  select(2,start,end,1) %>% 
  as.tibble() %>% 
  rename(chr="V2", ENTREZ="V1") %>% 
  mutate(chr=paste0("chr",chr))

## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.

Write dictonary for cell type names

exp_lvl5 <- exp_lvl5 %>% ungroup() %>% rename(Lvl5=cell_type,Expr_sum_mean=sumUMI)
dic_lvl5 <- select(exp_lvl5,Lvl5) %>% unique() %>% mutate(makenames=make.names(Lvl5))
write_tsv(dic_lvl5,"../../Data/Lake2018/dictionary_cell_type_names.txt")

Remove not expressed genes

genes_to_remove <- exp_lvl5 %>% group_by(Gene) %>% summarise(sum=sum(Expr_sum_mean)) %>% filter(sum==0)
exp_lvl5 <- filter(exp_lvl5,!Gene%in%genes_to_remove$Gene)

QC

Remove not expressed genes

genes_to_remove <- exp_lvl5 %>% 
  group_by(Gene) %>% 
  summarise(sum=sum(Expr_sum_mean)) %>% 
  filter(sum==0)
exp_lvl5 <- filter(exp_lvl5,!Gene%in%genes_to_remove$Gene)

Scale to 1 million molecules

Each cell type is scaled to the same total number of molecules.

exp_lvl5 <- exp_lvl5 %>% 
  group_by(Lvl5) %>% 
  mutate(Expr_sum_mean=Expr_sum_mean*1e6/sum(Expr_sum_mean))

Specificity Calculation

The specifitiy is defined as the proportion of total expression performed by the cell type of interest (x/sum(x)).

Lvl5

exp_lvl5 <- exp_lvl5 %>% group_by(Gene) %>% 
  mutate(specificity=Expr_sum_mean/sum(Expr_sum_mean))

Keep only ENTREZ genes protein coding and MAGMA tested genes

entrez2symbol <- AnnotationDbi::toTable(org.Hs.eg.db::org.Hs.egSYMBOL2EG) %>% rename(Gene="symbol",ENTREZ="gene_id")

## 

exp_lvl5 <- inner_join(exp_lvl5,entrez2symbol,by="Gene") 
exp_lvl5 <- inner_join(exp_lvl5,gene_coordinates,by="ENTREZ")

Get number of genes

Get number of genes that represent 10% of the dataset

n_genes <- length(unique(exp_lvl5$ENTREZ))
n_genes_to_keep <- (n_genes * 0.1) %>% round()

Save expression profile for other processing

save(exp_lvl5,file = "expression.ready.Rdata")

Functions

Get MAGMA input top10%

magma_top10 <- function(d,Cell_type){
  d_spe <- d %>% group_by_(Cell_type) %>% top_n(.,n_genes_to_keep,specificity) 
  d_spe %>% do(write_group_magma(.,Cell_type))
}

write_group_magma  = function(df,Cell_type) {
  df <- select(df,Lvl5,ENTREZ)
  df_name <- make.names(unique(df[1]))
  colnames(df)[2] <- df_name  
  dir.create(paste0("MAGMA/"), showWarnings = FALSE)
  select(df,2) %>% t() %>% as.data.frame() %>% rownames_to_column("Cat") %>%
  write_tsv("MAGMA/top10.txt",append=T)
return(df)
}

Get LDSC input top 10%

write_group  = function(df,Cell_type) {
  df <- select(df,Lvl5,chr,start,end,ENTREZ)
  dir.create(paste0("LDSC/Bed"), showWarnings = FALSE,recursive = TRUE)
  write_tsv(df[-1],paste0("LDSC/Bed/",make.names(unique(df[1])),".bed"),col_names = F)
return(df)
}

ldsc_bedfile <- function(d,Cell_type){
  d_spe <- d %>% group_by_(Cell_type) %>% top_n(.,n_genes_to_keep,specificity) 
  d_spe %>% do(write_group(.,Cell_type))
}

Write MAGMA/LDSC input files

Filter out genes with expression below 1 TPM.

exp_lvl5 %>% filter(Expr_sum_mean>1) %>% magma_top10("Lvl5")

## Warning: group_by_() is deprecated. 
## Please use group_by() instead
## 
## The 'programming' vignette or the tidyeval book can help you
## to program with group_by() : https://tidyeval.tidyverse.org
## This warning is displayed once per session.

exp_lvl5 %>% filter(Expr_sum_mean>1) %>% ldsc_bedfile("Lvl5")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

get_Lake_input.md

get_Lake_input.md

Load Data

Load single cell dataset

Compute sum of UMIs

Load computed UMI for each tissue

Load gene coordinates

Write dictonary for cell type names

QC

Scale to 1 million molecules

Specificity Calculation

Lvl5

Keep only ENTREZ genes protein coding and MAGMA tested genes

Get number of genes

Save expression profile for other processing

Functions

Get MAGMA input top10%

Get LDSC input top 10%

Write MAGMA/LDSC input files

Files

get_Lake_input.md

Latest commit

History

get_Lake_input.md

File metadata and controls

Load Data

Load single cell dataset

Compute sum of UMIs

Load computed UMI for each tissue

Load gene coordinates

Write dictonary for cell type names

QC

Scale to 1 million molecules

Specificity Calculation

Lvl5

Keep only ENTREZ genes protein coding and MAGMA tested genes

Get number of genes

Save expression profile for other processing

Functions

Get MAGMA input top10%

Get LDSC input top 10%

Write MAGMA/LDSC input files