-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTranscriptional_Pausing.R
72 lines (67 loc) · 3.09 KB
/
Transcriptional_Pausing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Set current working directory
setwd("YOUR/PATH/TO/SRC/FOLDER")
# Initialize all accompanying scripts
source("init.R")
source("data_preprocessing.R")
source("data_processing.R")
source("data_analysis.R")
source("helper_functions.R")
# Specify number of available cores for low, average and high load computations
CARGS$workers <- c(low = 6, avg = 12, high = 18)
WORKERS <- list(low = NULL, avg = NULL, high = NULL)
# Initialize parallel workers
WORKERS$low <- init.grid(CARGS$workers["low"])
WORKERS$avg <- init.grid(CARGS$workers["avg"])
WORKERS$high <- init.grid(CARGS$workers["high"])
#### Data Pre-Processing
# Process gene annotations
GENE.ANNOT <- parse.gene.annotations()
# Process gene quantifications
GENE.QUANT <- load.RNAseq()
# Process CAGE transcription start sites [unfinished]
CTSS <- load.CAGE.tss(ncores = CARGS$workers["avg"])
# Process CHIP-seq data sets
chipseq.files <- retrieve.CHIPseq.experiments()
CHIPseq <- parse.CHIPseq.experiments(chipseq.files)
# Process eCLIP-seq data sets
eclipseq.files <- retrieve.eCLIPseq.experiments()
eCLIPseq <- parse.eCLIPseq.experiments(eclipseq.files)
# Process housekeeping gene annotations
HK.GENES <- load.house.keeping.genes()
# Retrieve sequence specific factors
SEQ.SPEC <- retrieve.sequence.specific.factors()
# Load CpG Island Annotations
CPG.ISLANDS <- load.cpg.island()
#### Data Processing
# Retrieve valid coding transcripts
valid.coding.transcripts <- retrieve.valid.coding.transcripts()
# Retrieve valid non-coding transcripts
valid.non.coding.transcripts <- retrieve.valid.non.coding.transcripts()
# Calculate traveling ratios for protein coding transcripts
traveling.ratios <- calculate.traveling.ratio()
# Retrieve 7SK-ncRNA transcripts
RN7SK.ANNOT <- load.7SKncRNA.data()
# Identify novel 7SK-ncRNA binding proteins
novel.rn7sk.binders <- identify.7SK.binders(RN7SK.ANNOT, eCLIPseq)
# Calculate relatedness of 7SK transcript variants
rn7sk.phylogeny()
# Group transcripts by bound RNA-binding proteins (RBPs)
tx.by.rbp <- get.target.transcripts(eCLIPseq, name = "tx.by.rbp.eclip")
# Group transcripts by bound DNA-binding proteins (DBPs)
tx.by.dbp <- get.target.transcripts(CHIPseq, name = "tx.by.dbp.chip")
#### Data Analysis
# Build feature vectors for predictive models
feature.vectors <- build.feature.vector()
# Build feature matrices for predictive models
model.matrices <- build.model.matrices(feature.vectors, exclude = c(".*Quant$"))
# Retrain models on local machine with optimized parameter setup
model.results <- train.xgb.models(model.matrices)
# Run random models
random.model.results <- train.randomized.xgb.models()
#### Model interpretations
# Interpret model results (create paper figures)
evaluate.feature.effects(model.results = model.results,
feature.subspace = "All",
model.target = "target_traveling.ratio",
plot.base.size = 16)
create.supplementary.tables()