-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_processing_TCGA_RNA_data.R
90 lines (76 loc) · 3.06 KB
/
01_processing_TCGA_RNA_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#Step1: load TCGA tumor and normal data
library(gelnet)
library(dplyr)
library(biomaRt)
library(synapser)
synLogin("fraya","nyg789654")
# Load TCGA pancancer RNAseq data to crate tumor index
s <- synGet( "syn4976369", downloadLocation = "./data/pancan" )
# Auxiliary function: Reduces HUGO|POSITION gene IDs to just HUGO
f <- function( v ) unlist( lapply( strsplit( v, "\\|" ), "[[", 1 ) )
X <- read.delim( s$path, as.is=TRUE, check.names=FALSE ) %>% ## Read the raw values
filter( !grepl( "\\?", gene_id ) ) %>% ## Drop genes with no mapping to HUGO
mutate( gene_id = f( gene_id ) ) #%>% ## Clip gene ids to HUGO
#filter( gene_id %in% names(w) ) ## Reduce to the signature's gene set
## SLC35E2 has multiple entries with the same HUGO id
## Keep the first entry only
j <- grep( "SLC35E2", X[,1] )
if( length(j) > 1 )
X <- X[-j[-1],]
## Convert to a matrix
rownames(X) <- NULL
X <- X %>% tibble::column_to_rownames( "gene_id" ) %>% as.matrix()
## get pancancer metadata
library(TCGAbiolinks)
pandata<-PanCancerAtlas_subtypes()
pandata<-as.data.frame(pandata)
pandata$sample<-NULL
for (i in 1:length(pandata$pan.samplesID)){
pandata$sample[i]<-paste(unlist(strsplit( pandata$pan.samplesID[i], "-" ))[1],
unlist(strsplit( pandata$pan.samplesID[i], "-" ))[2],
unlist(strsplit( pandata$pan.samplesID[i], "-" ))[3],sep="-")
}
barcode<-colnames(X)
non_tumor<-grep("^TCGA-..-....-1[0-9].-...-....-..",barcode,value=T)
tumor<-grep("^TCGA-..-....-0[0-9].-...-....-..",barcode,value=T)
Tcga_Anno<-data.frame(barcode,type="tumor")
for (i in 1:length(barcode)){
if (Tcga_Anno[i,]$barcode %in% non_tumor){Tcga_Anno[i,]$type="non_tumor"}
}
#Step2: divide the training set and test set;
set.seed(1234)
# remove the NA data
X <- na.omit(X)
tumor_barcode <- Tcga_Anno[Tcga_Anno$type=="tumor",1];
non_tumor_barcode <- Tcga_Anno[Tcga_Anno$type=="non_tumor",1];
training_set_barcode<-sample(tumor_barcode,as.integer(0.75*length(tumor_barcode)))
test_set_barcode<-setdiff(tumor_barcode,training_set_barcode)
training_set_data <- X[,training_set_barcode]
test_set_data <- X[,test_set_barcode]
non_tumor_data <- X[,non_tumor_barcode]
validation_data <- cbind(test_set_data,non_tumor_data)
#Step3: Calculate the CV and filter genes
cal_cv=function(x){
y=na.omit(x)
return(sd(y)/mean(y))
}
gene_CV<-apply(training_set_data, 1, cal_cv)
# density plot
data<-as.data.frame(gene_CV);
range(data$gene_CV)
library(ggplot2)
pdf("./gene_cv.pdf",width=10,height=4)
ggplot(data, aes(x=gene_CV)) + xlab("gene_CV")+
geom_density(alpha=.25) + theme_classic()
dev.off()
data$gene<-rownames(data)
#select gene_cv>1 as cutoff
gene_hCV<-data[data$gene_CV>1,2]
#the high CV gene correlation matrix
training_set_data<-training_set_data[which(rownames(training_set_data)%in%gene_hCV),]
cor_data<-cor(t(training_set_data));
write.csv(cor_data,"highCV_gene_correalation.csv")
write.csv(validation_data,"validation_data.csv")
write.csv(training_set_data,"training_set_data.csv")
write.csv(test_set_data,"test_set_data.csv")
write.csv(Tcga_Anno,"Tcga_Anno_info.csv")