Skip to content

Commit

Permalink
uploading all clean codes
Browse files Browse the repository at this point in the history
  • Loading branch information
Arruda committed Oct 17, 2023
1 parent 36f1d55 commit a3efe2c
Show file tree
Hide file tree
Showing 19 changed files with 4,378 additions and 0 deletions.
33 changes: 33 additions & 0 deletions add_id.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#---------------------- Insert ID -------------------------#
awk 'BEGIN{FS=","; OFS=","}
function custom_sort(i1,v1,i2,v2){
l1=length(v1); l2=length(v2);
if (l1 == l2) {
return (v1 > v2)? 1:-1
} else {
return l2 - l1
}
} NR==1 { $0=$0 FS "ID" }
NR>1 { a[1]=$11; a[2]=$12; asort(a,b,"custom_sort");
$(NF+1) = sprintf("%s:%s_%s_%s",$1,$2,b[1],b[2])
}1' T2D_european_noID.txt > T2D_european.txt | column -t


#------------------ Insert ID in multiple files ---------------------#
for i in basalganglia-EUR-30 cortex-AFR-40 cortex-EAS-30 cortex-EUR-80 hippocampus-EUR-30;
do awk 'BEGIN{FS=","; OFS=","}
function custom_sort(i1,v1,i2,v2){
l1=length(v1); l2=length(v2);
if (l1 == l2) {
return (v1 > v2)? 1:-1
} else {
return l2 - l1
}
} NR==1 { $0=$0 FS "ID" }
NR>1 { a[1]=$8; a[2]=$9; asort(a,b,"custom_sort");
$(NF+1) = sprintf("%s:%s_%s_%s",$10,$11,b[1],b[2])
}1' eQTL_${i}_hg19.csv > eQTL_${i}.csv;
done



436 changes: 436 additions & 0 deletions causal_inference/MR_geneexp.R

Large diffs are not rendered by default.

435 changes: 435 additions & 0 deletions causal_inference/MR_vanilla.R

Large diffs are not rendered by default.

471 changes: 471 additions & 0 deletions causal_inference/mr_chil_adult_bmi.R

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions colocalization/get_T2D_SCZ_sumstats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
library(dplyr)

project_folder <- "/lustre/groups/itg/teams/zeggini/projects/SCZ_T2D/"
source("/project_data/scripts/read_files_config.R")

region <- data.table::fread(paste0(output.path, "GWAS_regions.csv")) %>% .[CHR!=23]
credset <- data.table::fread(paste0(output.path, "credible_set.csv"))
credset[, SNP:=ID]
result <- data.table::fread(paste0(output.path, "final_coloc_result_pp4.csv"))

read.data <- function(data.path, gwas.coloc.result, chr.col="CHR", pos.col="POS", range=1e+6){
dt <- data.table::fread(data.path)
res.dt <- data.table::data.table()

for(i in 1:nrow(gwas.coloc.result)){
reg <- gwas.coloc.result[i]
res.dt <- rbind(res.dt, dt[get(chr.col)==reg$CHR
& data.table::inrange(get(pos.col), reg$POS-range, reg$POS+range)])
}
return(res.dt)
}

gwas <- lapply(traits, function(i){read.data(data.path=paste0(output.path, "GWAS_", i, "_precoloc.txt"), gwas.coloc.result=result, chr.col="chr", pos.col="pos")})
names(gwas) <- traits

data.table::fwrite(gwas[[1]], paste0(data.path, "T2D_gtex_coloc.txt"))
data.table::fwrite(gwas[[2]], paste0(data.path, "SCZ_gtex_coloc.txt"))

# Get regions for colocalization
region <- data.table::fread(paste0(output.path, "GWAS_regions.csv")) %>% .[CHR!=23]
result <- data.table::fread(paste0(output.path, "final_coloc_result_pp4.csv"))
range=1e+6
coloc.regions <- result[, .(region, CHR, POS, lead.SNP, rsID)]
coloc.regions[, `:=`(start=POS-range, end=POS+range)]

data.table::fwrite(coloc.regions, paste0(output.path, "coloc_regions.csv"))

# Mapping data
mapping.file <- data.table::fread("data/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz")
mapping.file <- data.table::data.table()

for (i in 1:nrow(coloc.regions)){
pos <- coloc.regions[i, POS]
chr <- coloc.regions[i, CHR]
mapping.dt <- rbind(mapping.dt, mapping.file[sub("chr", "", CHR)==chr & data.table::inrange(POS, pos-range, pos+range)])
}

data.table::fwrite(mapping.dt, "data/GTEX_mapping_file.txt")


354 changes: 354 additions & 0 deletions colocalization/gwas_coloc.R

Large diffs are not rendered by default.

302 changes: 302 additions & 0 deletions colocalization/gwas_gtex_coloc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,302 @@
#####################################################################################
#---------------------------------- Functions --------------------------------------#
#####################################################################################
plot.hyprcoloc <- function(traits, tissue, qtl.type, gene, data, reg, res, plot.path, genes.info, ld.path=NULL, range=1e+6){
sel.snp <- res$results$candidate_snp[1]
sel.chr <- as.integer(gsub(":.*", "", sel.snp))
sel.pos <- as.integer(gsub("_.*", "", gsub(".*:", "", sel.snp)))
sel.rsid <- data[ID==sel.snp, snp.t1]

# Get data
trait1.region <- data[chr==sel.chr & dplyr::between(pos, sel.pos-range, sel.pos+range) & !is.null(snp.t1),
.(CHR=chr, SNP=snp.t1, P=pval.t1, BP=pos, logP=-log10(as.numeric(pval.t1)))]
trait2.region <- data[chr==sel.chr & dplyr::between(pos, sel.pos-range, sel.pos+range),
.(CHR=chr, SNP=snp.t2, P=pval.t2, BP=pos, logP=-log10(as.numeric(pval.t2)))]
qtl.region <- data[chr==sel.chr & dplyr::between(pos, sel.pos-range, sel.pos+range),
.(CHR=chr, SNP=snp.t1, P=pvalue, BP=pos, logP=-log10(as.numeric(pvalue)))]

# Get LD matrix
# ld.file <- data.table::fread(paste0(ld.path, "region", reg , "_LD.ld")) %>%
# .[SNP_A==sel.rsid | SNP_B==sel.rsid] %>% .[SNP_B==sel.rsid, c("SNP_B", "SNP_A") := .(SNP_A, SNP_B)]
ld.file <- NA

# Regional association plot
data.lst <- list(trait1.region, trait2.region, qtl.region)
names(data.lst) <- c(traits[[1]], traits[[2]], paste(qtl.type, tissue, gene, sep="_"))
locus.zoom(data = data.lst,
offset_bp = range,
genes.data = genes.info,
file.name = paste0(plot.path, paste(traits[1], traits[2], reg, qtl.type, tissue, gene, sep="_"), ".png"),
#secondary.snp = ifelse(credible.set<2, NA, credible.set),
snp=sel.rsid,
ignore.lead=TRUE,
#ld.file=ld.file,
pp="PP4",
pp.value=round(res$results$posterior_prob[1], digits=3),
nplots=TRUE)
}

request_datasets_from_api <- function(study_id = "",
quant_method = "",
sample_group = "",
tissue_id = "",
study_label = "",
tissue_label = "",
condition_label = "") {
size = 1000 #Page size
start = 0 #Page start

parameter_values = c(study_id,quant_method,sample_group,tissue_id,study_label,
tissue_label,condition_label)
parameter_names = c('study_id','quant_method','sample_group','tissue_id',
'study_label','tissue_label','condition_label')

while (T) {
URL = glue("https://www.ebi.ac.uk/eqtl/api/v2/datasets/?size={size}&start={start}")

#Adding defined parameters to the request
for (i in 1:length(parameter_values)) {
par = parameter_values[i]
par_name = parameter_names[i]
if (par != "")
URL = glue("{URL}&{par_name}={par}")
}

r <- GET(URL, accept_json())
cont <- content(r, "text", encoding = "UTF-8")

# If the request was unsuccessful
if (status_code(r) != 200) {
#If we get no results at all, print error
if (start == 0) {
print(glue("Error {status_code(r)}"))
print(cont)
return ()
}
#else just break
break
}

cont_df <- fromJSON(cont)

if (start == 0) {
responses <- cont_df
}
else{
responses <- rbind(responses, cont_df)
}
start <- start + size
}
return(responses)
}

request_associations_around_position <- function(dataset_id, position, chromosome_id, offset = 500000){
size = 1000
start = 0
range_start = position - offset
range_end = position + offset


while (TRUE){
URL = glue("https://www.ebi.ac.uk/eqtl/api/v2/datasets/{dataset_id}/associations?size={size}&start={start}&pos={chromosome_id}:{range_start}-{range_end}")

r <- GET(URL, accept_json())
cont <- content(r, "text", encoding = "UTF-8")

if (status_code(r) != 200) {
# Loop will break if the request was unsuccessful
if(start==0) {
print(glue("Error {status_code(r)}"))
print(cont)
return()}
break
}


cont_df <- fromJSON(cont)

if (start == 0){
responses <- cont_df
}
else{
responses <- rbind(responses, cont_df)
}
start <- start + size
}
return(responses)
}

get.credset <- function(snpscores, data, reg, gene, tissue, qtl.type, value=0.95){
sorted <- sort(snpscores, decreasing=TRUE)
cs <- cumsum(sorted)
w <- which(cs > value)[1]
credset <- data.table::data.table(region=reg,
geneID=gene,
qtl.type=qtl.type,
tissue=tissue,
ID=names(sorted)[1:w],
SNP.PP4=sorted[1:w],
rsID=data[ID %in% names(sorted)[1:w], snp.t1],
chr=data[ID %in% names(sorted)[1:w], chr],
pos=data[ID %in% names(sorted)[1:w], pos])
return(credset)
}

library(httr)
library(jsonlite)
library(glue)
library(rtracklayer)
library(dplyr)
library(hyprcoloc)

setwd("C:/Users/ana.arruda/OneDrive - Helmholtz Zentrum München/Projects/SCZ_T2D/")

# source("scripts/read_files_config.R")
source("scripts_bckup/plot_functions.R")
GRCh37_Genes <- read.delim("data/UCSC_GRCh37_Genes_UniqueList.txt", stringsAsFactors = FALSE, header = TRUE)

t2d <- unique(data.table::fread("data/T2D_gtex_coloc.txt"))
scz <- unique(data.table::fread("data/SCZ_gtex_coloc.txt"))

tissues.qtl <- data.table::fread("tissue_qtl.csv")
tissues.gtex <- tissues.qtl[source=="GTEx"]

# Get dataset ID from eQTL catalogue
tissues.gtex[qtl.type=="eqtl", dataset.id:=request_datasets_from_api(quant_method="ge", sample_group=tolower(tissue))$dataset_id, by=tissue]
# tissues.gtex[qtl.type=="sqtl", dataset.id:=request_datasets_from_api(quant_method="txrev", sample_group=tolower(tissue))$dataset_id, by=tissue]
tissues.gtex[qtl.type=="sqtl", dataset.id:=request_datasets_from_api(quant_method="leafcutter", sample_group=tolower(tissue))$dataset_id, by=tissue]
tissues.gtex[qtl.type=="tqtl", dataset.id:=request_datasets_from_api(quant_method="txrev", sample_group=tolower(tissue))$dataset_id, by=tissue]

# Define the regions of interest
coloc.regions <- data.table::fread("gwas_coloc/final_coloc_result_pp4.csv")

# Lift over regional data from build 37 to build 38
print("lifting over genomic risk coordinates for coloc region")
system("gzip -d data/hg19ToHg38.over.chain.gz")
ch <- import.chain("data/hg19ToHg38.over.chain")
for (i in 1:nrow(coloc.regions)){
regions.b37 <- GRanges(coloc.regions[i, .(CHR, start=POS, end=POS)])
seqlevelsStyle(regions.b37) = "UCSC" # necessary
# regions.b38 <- data.table::as.data.table(unlist(liftOver(regions.b37, ch)))
coloc.regions[i, POS.b38:=data.table::as.data.table(unlist(liftOver(regions.b37, ch)))$start]
}

# mapping.file <- data.table::fread("data/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz")

# Perform colocalization, filtering, or any other analysis as needed
res.dt <- data.table::data.table()
credset.dt <- data.table::data.table()

for (i in 15:nrow(tissues.gtex)){
t <- tissues.gtex[i]

for (j in 1:nrow(coloc.regions)){
reg <- coloc.regions[j]

# Get data
data <- merge(t2d[chr==reg$CHR & data.table::inrange(pos, reg$POS-1e6, reg$POS+1e6)],
scz[chr==reg$CHR & data.table::inrange(pos, reg$POS-1e6, reg$POS+1e6)],
by=c("ID", "chr", "pos"), allow.cartesian=T, suffixes=c(".t1", ".t2"))
data[snp.t1=="", snp.t1:=snp.t2]
data[snp.t2=="", snp.t2:=snp.t1]

# Fetch the tissue-specific eQTL data from the API for all genes in the region (b38)
qtl.data <- rbind(request_associations_around_position(t$dataset.id, reg$POS.b38-500000, reg$CHR),
request_associations_around_position(t$dataset.id, reg$POS.b38+500000, reg$CHR))
qtl.data <- unique(data.table::as.data.table(qtl.data))

if(nrow(qtl.data)==0) {
print("No SNPs in qtl data")
next
}

qtl.data <- qtl.data[, chromosome:=as.integer(chromosome)]

# Lift down data from build 38 to build 37
print("Lifting down genomic risk coordinates of qtl data")
path = system.file(package="liftOver", "extdata", "hg38ToHg19.over.chain")
ch = import.chain(path)
qtl.data.b38 <- GRanges(qtl.data[, `:=` (start=position, end=position)])
seqlevelsStyle(qtl.data.b38) = "UCSC" # necessary
qtl.data <- data.table::as.data.table(unlist(liftOver(qtl.data.b38, ch)))
qtl.data[, `:=`(POS.b37=start, POS.b38=position, chromosome=as.integer(sub("chr", "", seqnames)))]
qtl.data[, `:=`(start=NULL, end=NULL, seqnames=NULL, width=NULL, strand=NULL, position=NULL)]
qtl.data <- qtl.data[nchar(ref)==1 & nchar(alt)==1]

# for (gene in unique(data$gene_id)){
for (gene in unique(qtl.data$gene_id)){
tmp.data <- merge(data, qtl.data[gene_id==gene], by.x=c("chr", "pos"), by.y=c("chromosome", "POS.b37"))

if(nrow(tmp.data)==0) {
print("No SNPs in common between GWAS and qtl data")
next
}

# Flip alleles based on reference GWAS
tmp.data[, `:=` (ea=ifelse(alt==ea.t1, alt, ref),
nea=ifelse(alt==ea.t1, ref, alt),
beta=ifelse(alt==ea.t1, beta, -beta)), by=seq_len(nrow(tmp.data))]
tmp.data[, `:=`(alt=NULL, ref=NULL)]

# ----------- Prepare beta and ses matrices -----------
print("Preparing input matrices")
ses <- tmp.data[gene_id==gene, .(ID, rsID=snp.t1, se.t1, se.t2, se)]
colnames(ses) <- c("ID", "rsID", "t2d", "scz", paste(t$qtl.type, t$tissue, gene, sep="_"))
betas <- tmp.data[gene_id==gene, .(ID, rsID=snp.t1, beta.t1, beta.t2, beta)]
colnames(betas) <- c("ID", "rsID", "t2d", "scz", paste(t$qtl.type, t$tissue, gene, sep="_"))

# ----------- Run HyPrColoc -----------
print("Colocalization analysis using HyPrColoc")
id <- betas$ID
rsid <- betas$rsID
betas_mat <- as.matrix(betas[, c('ID', 'rsID'):=NULL])
rownames(betas_mat) <- id
ses_mat <- as.matrix(ses[, c('ID', 'rsID'):=NULL])
rownames(ses_mat) <- id
binary.traits = c(1,1,0)
betas_mat <- na.omit(betas_mat)
ses_mat <- na.omit(ses_mat)

if (nrow(betas_mat)>1){
res <- hyprcoloc::hyprcoloc(betas_mat, ses_mat, trait.names=colnames(betas_mat), snp.id=id, bb.alg=FALSE,
binary.outcomes=binary.traits, prior.1=1e-4, prior.c=0.05, snpscores = TRUE)


if(res$results$posterior_prob>0.8) {
print(paste("Plotting for", t$qtl.type, "in", t$tissue, "and", gene, sep=" "))
plot.hyprcoloc(traits=c("t2d", "scz"), t$tissue, t$qtl.type, gene, unique(tmp.data[gene_id==gene], by="snp.t1"),
reg$region, res, "gwas_coloc/qtl_plots/", genes.info=GRCh37_Genes)

# get 95% credible set
credset.dt <- rbind(credset.dt, get.credset(snpscores=res$snpscores,
data=tmp.data[gene_id==gene],
reg=reg$region,
gene=gene,
tissue=t$tissue,
qtl.type=t$qtl.type))

res.dt <- rbind(res.dt, data.table::data.table(region=reg$region,
geneID=gene,
qtl.type=t$qtl.type,
tissue=t$tissue,
posterior_prob=res$results$posterior_prob,
candidate_snp=res$results$candidate_snp,
rsID=tmp.data[ID==res$results$candidate_snp & gene_id==gene]$snp.t1,
posterior_explained_by_snp=res$results$posterior_explained_by_snp,
qtl.beta=tmp.data[ID==res$results$candidate_snp & gene_id==gene]$beta,
qtl.se=tmp.data[ID==res$results$candidate_snp & gene_id==gene]$se,
qtl.pval=tmp.data[ID==res$results$candidate_snp & gene_id==gene]$pval), fill=TRUE)
}
}
}
}
}

res.dt <- res.dt[order(region)]
res.dt[, geneID:=gsub("(.*)\\..*", "\\1", gsub(".*_(.*)", "\\1", geneID))]
data.table::fwrite(res.dt, "gwas_coloc/qtl_gtex_hyprcoloc_results.csv")
data.table::fwrite(credset.dt, "gwas_coloc/qtl_gtex_hyprcoloc_credible_set.csv")




Loading

0 comments on commit a3efe2c

Please sign in to comment.