Cutoff selection #2

Open · wants to merge 25 commits into base: master
606 changes: 0 additions & 606 deletions LSEA.py

This file was deleted.

230 changes: 230 additions & 0 deletions LSEA_2.4.py
@@ -0,0 +1,230 @@
#!/usr/bin/env python3

import argparse
import subprocess
import sys
import os
import csv
import json
import shutil
from collections import defaultdict, Counter
from scipy.stats import hypergeom
from rpy2 import robjects
from utils import get_overlapping_features, count_intervals


def tsv_len(tsv_file):
    # Number of data rows: total line count minus the header line
    i = -1
    with open(tsv_file) as f:
        for i, _ in enumerate(f):
            pass
    return i


def run_plink(plink_path, bfile_path, tsv_file, input_dict, p, out_name):
    # Write a reduced TSV (SNP, Chr, Pos, P) that PLINK can use for clumping
    base_name = os.path.basename(tsv_file).split('.')[0]
    tsv_plink = os.path.join(os.getcwd(), out_name, base_name + '_for_plink.tsv')
    out_plink = os.path.join(os.getcwd(), out_name, base_name)
    with open(tsv_plink, 'w', newline='') as csvfile:
        my_writer = csv.writer(csvfile, delimiter='\t')
        my_writer.writerow(["SNP", "Chr", "Pos", "P"])
        for snp in input_dict:
            # Only the chromosome, position and p-value are needed by PLINK
            my_writer.writerow([snp] + input_dict[snp][0:3])
    subprocess.call(
        f'{plink_path}/plink --bfile {bfile_path} --clump {tsv_plink} '
        f'--clump-field P --clump-p1 {p} --clump-p2 0.01 --clump-r2 0.1 '
        f'--clump-snp-field SNP --clump-kb 500 --out {out_plink} '
        f'--allow-no-sex --allow-extra-chr 2> ./{out_name}/PLINK_clumping.log',
        shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return out_plink + ".clumped"


def normalize_path(path):
    # Strip trailing slash(es) so path components can be appended safely
    return path.rstrip("/")


def get_snp_info(tsv_file, names):
input_dict = defaultdict(list)
names = list(map(lambda x: x.lower(), names))
with open(tsv_file, 'r', newline='') as csvfile: # Convert tsv to dictionary
my_reader = csv.reader(csvfile, delimiter='\t')
csv_headings = list(map(lambda x: x.lower(), next(my_reader)))
try: # We find necessary indices from header
chr_index = csv_headings.index(names[0])
pos_index = csv_headings.index(names[1])
id_index = csv_headings.index(names[2])
p_index = csv_headings.index(names[3])
        except ValueError:
            print("Check that your TSV file has a header row with correct column names!")
            sys.exit(1)
        for row in my_reader:  # Header row was already consumed by next() above
            # Variant ID -> [chromosome, position, p-value]
            input_dict[row[id_index]] = [row[chr_index], row[pos_index], row[p_index]]
return input_dict


def make_bed_file(clumped_file, interval, out_name):
    with open(f"./{out_name}/clumps.bed", 'w', newline='') as bed_file:
        my_writer = csv.writer(bed_file, delimiter='\t')
        with open(clumped_file, 'r') as cl_file:  # PLINK clumping result (SNP sets)
            my_reader = csv.reader(cl_file, delimiter='\t')
            for row in my_reader:
                if len(row) == 0:
                    continue
                # PLINK pads its report with spaces; split and drop empty fields
                row = list(filter(lambda x: len(x) != 0 and x != " ", row[0].split(" ")))
                if row[0] == "CHR":  # Skip header row
                    continue
                # Interval of +/- `interval` bp around the lead SNP position
                bed_row = [row[0], max(int(row[3]) - interval, 0), int(row[3]) + interval]
                my_writer.writerow(bed_row)
# Sort file
subprocess.call(
f"bedtools sort -i ./{out_name}/clumps.bed > ./{out_name}/clumps_sorted.bed", shell=True)
# Merge regions
subprocess.call(
f"bedtools merge -i ./{out_name}/clumps_sorted.bed > ./{out_name}/merged.bed", shell=True)
with open(f"./{out_name}/merged_fixed_size.bed", 'w', newline='') as bed_file: # Here we write to new file
my_writer = csv.writer(bed_file, delimiter='\t', lineterminator='\n')
with open(f'./{out_name}/merged.bed', 'r', newline='') as inter:
my_reader = csv.reader(inter, delimiter='\t')
for row in my_reader:
middle_point = int(row[1]) + (int(row[2]) - int(row[1])) // 2
new_row = [row[0], max(middle_point - interval, 0), middle_point + interval]
my_writer.writerow(new_row)
# Add numeration to intervals
subprocess.call(
"awk {'print $0\"\t\"FNR'}" + f" ./{out_name}/merged_fixed_size.bed > ./{out_name}/merged_with_line_numbers.bed", shell=True)


def p_val_for_gene_set(n_big, k_big, n, k):
    # Hypergeometric survival function: chance of seeing more than k
    # gene-set loci among n query loci, given a universe of n_big loci
    # of which k_big overlap the gene set
    return hypergeom.sf(k, n_big, k_big, n)


def calculate_qvals(pvals):
    # FDR-adjust p-values (Benjamini-Hochberg) using R's p.adjust via rpy2
    robjects.r('''
    f <- function(pvals) {
      p.adjust(pvals, method = "fdr")
    }
    ''')
    r_f = robjects.globalenv['f']
    v = robjects.FloatVector(pvals)
    return list(r_f(v))


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='LSEA')
parser.add_argument('-input', help='Input file in TSV format. The file should contain at least four columns: chromosome name, variant position, variant ID, and GWAS p-value. The file MUST have a header row.', metavar='file',
type=str, required=True)
parser.add_argument('-universe', help='JSON file with universe. Several universe files may be specified separated by space',
metavar='path', nargs='+', type=str, required=True)
parser.add_argument('-out', help='Relative path to output directory (Default: lsea_result). Will be created.',
metavar='name', type=str, required=False, default='lsea_result')
    parser.add_argument('-p', help='p-value cutoff(s) used to identify associated loci. Several cutoffs may be specified separated by space; a result file is produced for each (Default: 1e-5 5e-8)',
                        metavar='float', nargs='+', required=False, default=['1e-5', '5e-8'])
parser.add_argument('-plink_dir', help='Path to a directory with PLINK executable', metavar='dir',
type=str, required=False)
parser.add_argument('-bfile', help='Genotypes in PLINK .bed format that will be used for LD-based clumping. Only file prefix should be given.', metavar='prefix',
type=str, required=False)
    parser.add_argument('-qval_threshold', help='Q-value threshold for output (Default: 0.1)',
                        metavar='float', type=float, required=False, default=0.1)
parser.add_argument('-column_names', help='Column names for input TSV. These names will be used if the column names do not match the default: chr, pos, id, p. Names should be given in the same order (chromosome, position, ID, p-value)',
metavar='name', nargs=4, type=str, required=False)

if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()

print(f'INFO\tRunning LSEA with the following CMD: {" ".join(sys.argv[:])}')

tsv_file = args.input
path_to_plink_dir = args.plink_dir
if path_to_plink_dir is not None:
path_to_plink_dir = normalize_path(args.plink_dir)
path_to_bfile = args.bfile
qval_thresh = args.qval_threshold
col_names = args.column_names
json_files = args.universe
p_cutoffs = [float(x) for x in args.p]
out_name = args.out
if col_names is None:
col_names = ["chr", "pos", "id", "p"]
print(f'INFO\tReading input file {tsv_file}...')
input_dict = get_snp_info(tsv_file, col_names)

if os.path.exists(f'./{out_name}'):
        print(f'WARN\tOutput directory {out_name} exists, removing...')
shutil.rmtree(f'./{out_name}')
os.makedirs(f'./{out_name}')

for universe_file in json_files:
universe_name = os.path.basename(universe_file).replace('.json', '')
print(f'INFO\tProcessing universe {universe_name}')
        with open(universe_file, 'r') as universe_handle:
            universe = json.load(universe_handle)
interval = universe["interval"]
with open(f'./{out_name}/features.bed', 'w') as feature_file:
for feature in universe["features"]:
bed_line = '\t'.join(universe["features"][feature])
bed_line = bed_line.replace('chr', '')
feature_file.write(f'{bed_line}\n')
interval_counts_for_universe = universe["interval_counts"]
print(f'INFO\tUniverse stats:')
print(f'\tinterval size = {interval};\n\tinterval count = {universe["universe_intervals_number"]};')
print(f'\ttotal number of features = {len(universe["features"])};')
print(f'\tfeature sets in universe = {len(universe["gene_set_dict"])}')

stats_file = open(f"./{out_name}/annotation_stats_{universe_name}.tsv", 'w')
print('p_cutoff\tnum_loci\tannotated_loci\tunambiguous_annotations\tsignificant_hits\tmin_qval', file=stats_file)

for p_cutoff in p_cutoffs:
print(f'INFO\tCalculating enrichment with p-value cutoff = {p_cutoff}')
clumped_file = run_plink(path_to_plink_dir, path_to_bfile, tsv_file, input_dict, p_cutoff, out_name)
make_bed_file(clumped_file, interval, out_name)
            with open(f'./{out_name}/merged_with_line_numbers.bed') as merged_bed:
                n_intervals = sum(1 for _ in merged_bed)  # number of independent loci
target_features = get_overlapping_features(f"./{out_name}/merged_with_line_numbers.bed",
f'./{out_name}/features.bed', f"./{out_name}/inter.tsv")
feature_set = universe["gene_set_dict"]
interval_counts = count_intervals(feature_set, target_features)

            pvals = []
            # Gene sets sorted by number of overlapping loci, descending; the
            # same ordering is reused when writing results below, keeping
            # pvals and qvals aligned with the output rows
            for w in sorted(interval_counts, key=lambda item: len(interval_counts[item]), reverse=True):
                pvals.append(p_val_for_gene_set(universe["universe_intervals_number"],
                             interval_counts_for_universe[w], n_intervals, len(interval_counts[w])))
            qvals = calculate_qvals(pvals)

with open(f"./{out_name}/{universe_name}_result_{p_cutoff}.tsv", 'w', newline='') as file:
explained_loci = set()
feature_names = defaultdict(set)
hit_count = 0
min_qval = 1
my_writer = csv.writer(file, delimiter='\t')
my_writer.writerow(["Gene_set", "Overlapping_loci", "Description", "p-value", "q-value"])
for i, w in enumerate(sorted(interval_counts, key=lambda item: len(interval_counts[item]), reverse=True)):
                    # Require >= 3 overlapping loci and a q-value under the threshold
                    if len(interval_counts[w]) >= 3 and qvals[i] <= qval_thresh:
min_qval = min(min_qval, qvals[i])
hit_count += 1
for locus in interval_counts[w]:
explained_loci.add(locus)
feature_names[locus].add(';'.join(interval_counts[w][locus]))
row = [w, len(interval_counts[w]), dict(interval_counts[w]), pvals[i], qvals[i]]
my_writer.writerow(row)
# Calculating number of loci with ambiguous annotations
ambiguous_loci = 0
            for _, locus_features in feature_names.items():
                if len(locus_features) > 1:
                    expanded_set = sum([ftr.split(';') for ftr in locus_features], [])
                    feature_freq = dict(Counter(expanded_set))
                    # Ambiguous if no single feature appears in every annotation of the locus
                    if max(feature_freq.values()) < len(locus_features):
                        ambiguous_loci += 1
unambiguous = len(explained_loci) - ambiguous_loci
print(f'{p_cutoff}\t{n_intervals}\t{len(explained_loci)}\t{unambiguous}\t{hit_count}\t{min_qval}', file=stats_file)

stats_file.close()
124 changes: 24 additions & 100 deletions README.md
@@ -1,113 +1,37 @@
# LSEA
LSEA (Locus Set Enrichment Analysis) is a tool for performing gene set enrichment analysis on independent loci, taking into account LD (Linkage Disequilibrium).
# LSEA v0.2.4

## Getting Started
*This is a new version of the tool that is currently being extensively benchmarked. Please note that this is a development repository and use its contents with caution.*

LSEA can be applied to gene set enrichment analysis of data obtained from GWAS summary statistics files in TSV format. It is based on a simple hypergeometric test; however, it transforms genes and gene sets into independent loci and sets of independent loci to eliminate multiple signals from genes in LD and to enhance analysis precision.
<br>The tool includes a precompiled universe of independent loci based on data obtained from the UK Biobank (https://www.ukbiobank.ac.uk/). Data for all heritable phenotypes (based on partitioned heritability p-value < 0.05) were processed with PLINK to obtain independent loci for each phenotype. All files were then combined into a universe by merging intervals overlapping by more than 60%.
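
For illustration, the underlying test reduces to a single `scipy` call (a minimal sketch mirroring `p_val_for_gene_set` in LSEA_2.4.py; the helper name and the example numbers, borrowed from the sample output below, are ours):
```python
from scipy.stats import hypergeom

# M = independent loci in the universe, K = universe loci in the gene set,
# n = loci in the query, k = query loci overlapping the gene set
def locus_enrichment_pval(M, K, n, k):
    # Survival function, as used in LSEA_2.4.py
    return hypergeom.sf(k, M, K, n)

print(locus_enrichment_pval(17776, 36, 11, 6))  # illustrative numbers only
```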
## Prerequisites
- plink v1.9
- bedtools
- scipy for Python3
- rpy2 for Python3
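
These can typically be installed as follows (a sketch assuming a Debian-like system with pip; note that rpy2 also needs a working R installation, which LSEA uses for p.adjust):
```
sudo apt-get install bedtools r-base
pip3 install scipy rpy2
# plink v1.9: download from https://www.cog-genomics.org/plink/1.9/
# and pass its directory to LSEA via -plink_dir
```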

### Prerequisites
- python (3.7 or higher)
- scipy (1.0.0 or higher)
- pandas (0.17.1 or higher)
- numpy (1.14.1 or higher)
- PLINK (1.07 or higher) - http://zzz.bwh.harvard.edu/plink/
- SnpEff (4.3T or higher) - http://snpeff.sourceforge.net/
## Usage

### Installing

To install this tool, clone this repository to your PC:
```
~$ git clone https://github.com/LSEA
```

This version of LSEA replaces the p-value cutoff model with a new approach to selecting the optimal cutoff for your data. LSEA will create result files for multiple cutoffs and write a statistics file to help you select the optimal parameter value.

To run the full pipeline:
```
python3 LSEA_2.4.py -input ./data/in.tsv \
                    -plink_dir <plink_folder_path> \
                    -bfile <.bed file for PLINK> \
                    -universe ./data/universe.json
```
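
Since the statistics file is a plain TSV (columns as written by LSEA_2.4.py: p_cutoff, num_loci, annotated_loci, unambiguous_annotations, significant_hits, min_qval), cutoff selection can be scripted. A minimal sketch, assuming one simply prefers the cutoff with the most significant hits; the tool does not prescribe a selection rule, and the helper below is hypothetical:
```python
import csv

def best_cutoff(stats_path):
    # Read LSEA's annotation_stats_<universe>.tsv
    with open(stats_path) as f:
        rows = list(csv.DictReader(f, delimiter='\t'))
    # Assumption: maximize significant hits, break ties by smaller min_qval
    best = max(rows, key=lambda r: (int(r['significant_hits']),
                                    -float(r['min_qval'])))
    return float(best['p_cutoff'])

print(best_cutoff('lsea_result/annotation_stats_universe.tsv'))
```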

## Running and using the tool

First, you need to prepare a TSV file from GWAS summary statistics with the following structure:
| CHR | COORDINATE | RSID       | REF | ALT | PVAL        |
|-----|------------|------------|-----|-----|-------------|
| 9   | 136058188  | rs12216896 | C   | T   | 2.89651e-11 |
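
If your summary statistics use different headers, you can rename them in advance; a small sketch using pandas (pandas is not required by LSEA itself, and the source column names here are hypothetical):
```python
import pandas as pd

# Hypothetical input headers; adjust to your GWAS file
df = pd.read_csv('raw_sumstats.tsv', sep='\t')
df = df.rename(columns={'chromosome': 'CHR', 'base_pair_location': 'COORDINATE',
                        'variant_id': 'RSID', 'other_allele': 'REF',
                        'effect_allele': 'ALT', 'p_value': 'PVAL'})
df[['CHR', 'COORDINATE', 'RSID', 'REF', 'ALT', 'PVAL']].to_csv(
    'in.tsv', sep='\t', index=False)
```
With these headers, LSEA_2.4.py would need `-column_names CHR COORDINATE RSID PVAL`, since its defaults are chr, pos, id, p (header matching is case-insensitive).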

To launch this tool, you will also need to specify paths to the PLINK and SnpEff directories.

## Example usage
```
~$ python3 LSEA.py -af <input tsv-file> -sn <path to SNPeff> -pld <path to plink> -bf <bfile for plink> -p
```
This command will apply the LSEA algorithm to the input file and generate a TSV file with the following structure:
| gene_set | p-value | q-value | enrich_description |
|----------|---------|---------|--------------------|
| BIOCARTA_INTRINSIC_PATHWAY | 2.0446237642438122e-14 | 2.2517441515617103e-10 | (17776, 11, 36, 6, 'F11;FGB;FGA;F5;FGG;KLKB1') |
The first column contains the name of the set; the second and third contain the p-value and corrected q-value of the hypergeometric test; the last column lists the total number of independent loci, the number of loci in the query, the number of loci in the gene set, the number of loci shared by the query and the gene set and, finally, the gene list.<br>
Note that the gene list can be shorter than the number of shared loci, because only independent loci are counted in the analysis.<br>
The -p (--precompiled) flag indicates that the precompiled universe of independent loci based on UK Biobank data is used.<br>
Information about the HLA locus is excluded from the analysis due to the high ambiguity of LD scores within the HLA locus.
All results will be written to the `lsea_result` directory, which will be created for you. If you want to change the location of the result files, please specify a relative path using the `-out` option. By default, LSEA will perform enrichment analysis using two p-value cutoffs (`1e-05, 5e-08`). If you want to test more cutoffs, please specify them using the `-p` option separated by space.
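
For example (paths and the extra cutoff values are placeholders):
```
python3 LSEA_2.4.py -input ./data/in.tsv \
                    -universe ./data/universe.json \
                    -plink_dir <plink_folder_path> \
                    -bfile <.bed file for PLINK> \
                    -out my_lsea_run \
                    -p 1e-4 1e-5 1e-6 5e-8
```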

## Tool options:
```
-af <input.tsv>                   Input file in tsv-format
-vf <input.vcf>                   Annotated vcf-file if it is already prepared
-pl <input.clumped>               PLINK clumping result (.clumped file) if it is already prepared
--precompiled, -p                 Use precompiled loci
-sn <path to SnpEff directory>    Path to SnpEff
-g <genome>                       Flag for specifying genome for SnpEff annotation
-pld <path to PLINK directory>    Path to PLINK
-bf <bfile>                       Bfile for PLINK
```

To generate a universe from BED and GMT files, run:
```
python3 universe_generator.py -variants ./data/positions.tsv \
                              -features ./data/gencode_formatted.bed \
                              ./data/c2.all.v7.0.symbols.gmt \
                              -interval 100000 -o universe.json
```
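
For reference, LSEA_2.4.py expects the resulting universe JSON to contain at least the following top-level keys (a Python-style sketch inferred from how the script reads the file; the values shown are illustrative, not a formal schema):
```python
universe = {
    "interval": 100000,                  # half-size of each locus, bp
    "universe_intervals_number": 17776,  # total independent loci (illustrative)
    "features": {                        # feature name -> BED-like fields
        "GENE1": ["1", "1000", "5000", "GENE1"],
    },
    "gene_set_dict": {                   # gene set name -> member features
        "SOME_SET": ["GENE1"],
    },
    "interval_counts": {                 # gene set name -> universe loci overlapping it
        "SOME_SET": 36,
    },
}
```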

## Creating your own universe:
If you don't want to use the precompiled universe of independent loci, you can use options for creating your own universe based on GWAS summary statistics files. Use the -cu (--create_universe) option to create a universe of independent loci from your data:
```
-af <input.tsv> -cu
```
For this function you have to prepare clumping results for your GWAS data (obtain a .clumped file). If you have multiple
files (e.g. for different phenotypes), use the -cld <directory> flag to specify a directory with clumped files:
```
-cld <directory with clumped files>
```

To generate a universe using a directory with per-set BED files, run:
```
python3 universe_generator.py -variants ./data/positions.tsv \
                              -feature_files_dir <dir with BED files> \
                              -interval 100000 -o universe_dir.json
```


## Author

* **Anton Shikov** - *Initial work* - [anton-shikov](https://github.com/anton-shikov)


## License

This project is free and available for everyone.

Binary file added __pycache__/utils.cpython-38.pyc
Binary file not shown.