forked from mvab/mendelian-randomization-breast-cancer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_process_gwas_summary.Rmd
140 lines (118 loc) · 5.1 KB
/
01_process_gwas_summary.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
---
title: "Format data and extract instruments"
author: "Marina Vabistsevits"
date: "21/05/2020"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(vroom)
library(dplyr)
library(TwoSampleMR)
```
```{r}
# set path for pre-calculated data, outside the code repo
# `local` / `remote` (reading data from RDSF)
currently_working_env = "local"
source("set_paths.R")
set_paths(currently_working_env)
# metadata file that is itiratelely read in and updated
data_lookup<-paste0("metadata/data_lookup.csv")
# functions
source("functions.R")
```
```{r}
# supl functions
read_and_format <-function(file_gwas, data_version="ieu_gwas_pipeline"){
# different versions of data formats to read in
if (data_version == "ieu_gwas_pipeline"){
# data produced by IEU GWAS pipeline
out <-vroom(file_gwas,
col_select = c("SNP","BETA","SE","ALLELE1","ALLELE0","A1FREQ","P_BOLT_LMM_INF")) %>%
format_data(., type="outcome",
snp_col = "SNP",
beta_col = "BETA",
se_col = "SE",
effect_allele_col = "ALLELE1",
other_allele_col = "ALLELE0",
eaf_col = "A1FREQ",
pval_col = "P_BOLT_LMM_INF")
} else if (data_version == "Lagou2019"){
# format of Lagou2019 data from MAGIC coonsortium
out <- vroom(file_gwas,
col_select = c("snp","effect_allele","other_allele","eaf_hapmap_CEU",
"female_beta", "female_se" ,"female_pvalue")) %>%
format_data(., type="outcome",
snp_col = "snp",
beta_col = "female_beta",
se_col = "female_se",
effect_allele_col = "effect_allele",
other_allele_col = "other_allele",
eaf_col = "eaf_hapmap_CEU",
pval_col = "female_pvalue")
} else if (data_version == "Wheeler2017"){
# format of Wheeler2017 data from MAGIC coonsortium
out <- vroom(file_gwas,
col_select = c("snp","effect_allele","other_allele","eaf_hapmap_CEU",
"beta", "stderr" ,"pvalue")) %>%
format_data(., type="outcome",
snp_col = "snp",
beta_col = "beta",
se_col = "stderr",
effect_allele_col = "effect_allele",
other_allele_col = "other_allele",
eaf_col = "eaf_hapmap_CEU",
pval_col = "pvalue")
}
return(out)
}
extract_tophits <- function(outcome_gwas){
outcome_gwas %>%
filter(pval.outcome < 5e-8) %>%
convert_outcome_to_exposure() %>%
# using local reference
#clump_data_local(., local_path)
# using MR-base API
clump_data(., clump_r2 = 0.001)
}
```
```{r}
# specify the source of full summary stats GWAS file
data_source <- "ieu_gwas_pipeline" # ieu_gwas_pipeline / Lagou2019 / Wheeler2017
traits <- read_csv(data_lookup) %>% filter(source == data_source) %>% pull(trait)
```
# Process every file in a standard way:
# - Format GWAs data into outcome format and save as `GWAS_tidy_outcome.txt.gz`
# - Extract instruments and save as `tophits.tsv`
```{r message=F}
tidy_gwas <- "_GWAS_tidy_outcome.txt.gz"
tidy_tophits <- "_tophits.tsv"
for (current_trait in traits) {
gwas_filename<- read_csv(data_lookup) %>% filter(trait == current_trait) %>% pull(original_file)
file_gwas <- paste0(data_path_gwas_raw, gwas_filename)
print(paste0("Processing: ", current_trait, ", ", gwas_filename))
gwas_outcome_format<-read_and_format(file_gwas, data_version = data_source)
gwas_outcome_format$outcome <- current_trait
print(" -> finished formatting")
data_name <- paste0(read_csv(data_lookup) %>% filter(trait == current_trait) %>% pull(trait_file_name))
print(paste0("Saving tidy GWAS outcome file to: ", data_path_gwas, data_name, tidy_gwas))
vroom_write(gwas_outcome_format, paste0(data_path_gwas, data_name, tidy_gwas ))
print("Extracting tophits")
#gwas_outcome_format<-vroom(paste0(data_path_gwas, data_name, tidy_gwas )) # if need to run from here
tophits <- extract_tophits(gwas_outcome_format)
if (!exists("tophits")) { stop("Extarcting instruments failed!")}
print(paste0("Found ", dim(tophits)[1], " SNPs at < 5e-8"))
write_tsv(tophits, paste0(data_path_tophits, data_name, tidy_tophits))
print("Saved tophits file")
rm(gwas_outcome_format)
rm(tophits)
# update lookup file
read_csv(data_lookup) %>%
mutate(full_data = ifelse(trait == current_trait, paste0(data_name, tidy_gwas), full_data)) %>%
mutate(tophits_data = ifelse(trait == current_trait, paste0(data_name, tidy_tophits), tophits_data)) %>%
mutate(format = ifelse(trait == current_trait, "tsv", format)) %>%
write_csv(data_lookup)
print("Updated data lookup file")
}
```