diff --git a/README.md b/README.md index ca64494..a9c764d 100755 --- a/README.md +++ b/README.md @@ -26,7 +26,6 @@ PeakPlotter has non-python dependencies. In order to run PeakPlotter you need to install the following tools and add the executables to your `PATH`: * Plink 1.9 or newer ([available here](https://www.cog-genomics.org/plink2/index)) * LocusZoom Standalone 1.4 or newer ([available here](http://genome.sph.umich.edu/wiki/LocusZoom_Standalone)) -* BedTools ([available here](http://bedtools.readthedocs.io/en/latest/)) * Tabix ([available here](https://github.com/samtools/htslib)) * Moreutils (for `sponge`) diff --git a/Singularity b/Singularity index 680f9c9..5da1ce3 100755 --- a/Singularity +++ b/Singularity @@ -42,15 +42,7 @@ From: ubuntu:18.04 # Bedtools apt install -y python-pip apt-get install -y zlib1g zlib1g-dev firefox python-dev emacs - - cd /opt - git clone https://github.com/arq5x/bedtools2.git - cd bedtools2 - # gwava needs a version without "sam header" error messages - git checkout tags/v2.27.1 - make && make install pip install scipy pandas numpy scikit-learn==0.14.1 - pip install pybedtools # PLINK cd /opt diff --git a/peakplotter/__init__.py b/peakplotter/__init__.py index e9e0716..93cf1da 100755 --- a/peakplotter/__init__.py +++ b/peakplotter/__init__.py @@ -1,8 +1,3 @@ #!/usr/bin/env python3 -from pathlib import Path - -__version__ = '0.3.0' - - -PLOTPEAKS_SCRIPT = str(Path(__file__).parent.joinpath('plotpeaks.sh')) +__version__ = '0.3.1' diff --git a/peakplotter/_interactive_manh.py b/peakplotter/_interactive_manh.py index f3ff1b9..cd38e8f 100755 --- a/peakplotter/_interactive_manh.py +++ b/peakplotter/_interactive_manh.py @@ -1,12 +1,27 @@ +import sys import json from typing import Tuple, Union, List, Dict import requests import pandas as pd -from peakplotter import helper from peakplotter.data import CENTROMERE_B37, CENTROMERE_B38 + +def get_build_server(build: Union[int, str]) -> str: + B38_SERVER = "https://rest.ensembl.org" + B37_SERVER = "http://grch37.rest.ensembl.org" + mapper = { + 'b38': B38_SERVER, + '38': B38_SERVER, + 38: B38_SERVER, + 'b37': B37_SERVER, + '37': B37_SERVER, + 37: B37_SERVER + } + return mapper[build] + + def _query(url, headers = None): if headers is None: headers = dict() @@ -96,17 +111,18 @@ def make_resp(snps: pd.DataFrame, pheno: pd.DataFrame) -> pd.DataFrame: return resp[['rs', 'ps', 'consequence', 'pheno']] -def get_rsid_in_region(chrom, start, end, server): - print(f"[DEBUG] get_variants_in_region({chrom}, {start}, {end}, {server}") +def get_rsid_in_region(chrom, start, end, server, logger): + + logger.debug(f"get_variants_in_region({chrom}, {start}, {end}, {server})") snps = get_variants_in_region(chrom, start, end, server) - print(f"[DEBUG] get_phenos_in_region({chrom}, {start}, {end}, {server}") + logger.debug(f"get_phenos_in_region({chrom}, {start}, {end}, {server})") pheno = get_phenos_in_region(chrom, start, end, server) resp = make_resp(snps, pheno) return resp -def query_vep(chrom: pd.Series, pos: pd.Series, a1: pd.Series, a2: pd.Series, server: str) -> List[Dict]: +def query_vep(chrom: pd.Series, pos: pd.Series, a1: pd.Series, a2: pd.Series, server: str, logger) -> List[Dict]: chrom = chrom.astype(str) pos = pos.astype(int).astype(str) a1 = a1.astype(str) @@ -118,12 +134,12 @@ def query_vep(chrom: pd.Series, pos: pd.Series, a1: pd.Series, a2: pd.Series, se r = requests.post(server+ext, headers=headers, data=data) if not r.ok: - print(data) + logger.error(data) r.raise_for_status() return r.json() - -def 
_get_csq_novel_variants(e: pd.DataFrame, chrcol: str, pscol: str, a1col: str, a2col: str, server: str) -> pd.DataFrame: +# TODO: Merge with get_csq_novel_variants function +def _get_csq_novel_variants(e: pd.DataFrame, chrcol: str, pscol: str, a1col: str, a2col: str, server: str, logger) -> pd.DataFrame: """ This function assumes that the input DataFrame object `e` has the following columns: - ps @@ -139,7 +155,7 @@ def _get_csq_novel_variants(e: pd.DataFrame, chrcol: str, pscol: str, a1col: str novelsnps=copied_e.loc[(copied_e['ensembl_rs']=="novel") & (copied_e['ld']>0.1) & (copied_e['ensembl_consequence']!='double allele'),] if novelsnps.empty: return copied_e - jData = query_vep(novelsnps[chrcol], novelsnps[pscol], novelsnps[a1col], novelsnps[a2col], server) + jData = query_vep(novelsnps[chrcol], novelsnps[pscol], novelsnps[a1col], novelsnps[a2col], server, logger) csq = pd.DataFrame(jData) @@ -149,11 +165,39 @@ def _get_csq_novel_variants(e: pd.DataFrame, chrcol: str, pscol: str, a1col: str copied_e['ensembl_consequence'].replace('_', ' ') return copied_e +# TODO: Merge with _get_csq_novel_variants function +def get_csq_novel_variants(e, chrcol, pscol, a1col, a2col, server, logger): + copied_e = e.copy() + copied_e.loc[(copied_e['ensembl_rs']=="novel") & (copied_e[a1col]==copied_e[a2col]),'ensembl_consequence']='double allele' + novelsnps=copied_e.loc[(copied_e['ensembl_rs']=="novel") & (copied_e['ld']>0.1) & (copied_e['ensembl_consequence']!='double allele'),] + if novelsnps.empty: + return copied_e + novelsnps['query']=novelsnps[chrcol].astype(str)+" "+novelsnps[pscol].astype(int).astype(str)+" . "+novelsnps[a1col].astype(str)+" "+novelsnps[a2col].astype(str)+" . . ." + request='{ "variants" : ["'+'", "'.join(novelsnps['query'])+'" ] }' + ext = "/vep/homo_sapiens/region" + headers={ "Content-Type" : "application/json", "Accept" : "application/json"} + logger.info("\t\t\tšŸŒ Querying Ensembl VEP (POST) :"+server+ext) + r = requests.post(server+ext, headers=headers, data=request) + + if not r.ok: + logger.error("headers :"+request) + r.raise_for_status() + sys.exit(1) + + jData = json.loads(r.text) + csq=pd.DataFrame(jData) + + + for _, row in csq.iterrows(): + copied_e.loc[copied_e['ps']==row['start'],'ensembl_consequence']=row['most_severe_consequence'] + + copied_e['ensembl_consequence'].replace('_', ' ') + return copied_e -def get_overlap_genes(chrom, start, end, server) -> pd.DataFrame: +def get_overlap_genes(chrom, start, end, server, logger) -> pd.DataFrame: url = f'{server}/overlap/region/human/{chrom}:{start}-{end}?feature=gene' - helper.info("\t\t\tšŸŒ Querying Ensembl overlap (Genes, GET) :"+url) + logger.info("\t\t\tšŸŒ Querying Ensembl overlap (Genes, GET) :"+url) decoded = _query(url) df = pd.DataFrame(decoded).fillna('') diff --git a/peakplotter/helper.py b/peakplotter/helper.py deleted file mode 100755 index 5672b16..0000000 --- a/peakplotter/helper.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 - -import re -import sys -import json -import subprocess -import urllib.request -import urllib.parse -from typing import Union - -import requests -import pandas as pd - -# number by which to multiply the normal number of requests to Ensembl -# Increase if lots of 504 Timeout errors -ENSEMBL_USELESSNESS_COEFFICIENT=3 - -def info(*strs): - outstr="[INFO]" - for string in strs: - outstr+=" "+str(string) - print(outstr) - return - - -def warn(*strs): - outstr="[WARNING]" - for string in strs: - outstr+=" "+str(string) - print(outstr, file=sys.stderr) - return - - 
-class GeneCoordinates: - chrom=0 - start=0 - end=0 - gene_id="" - name="" - def __init__(self, chrom, start, end, gene_id, name): - self.chrom = chrom - self.start = start - self.end = end - self.gene_id=gene_id - self.name=name - - def extend(self, margin): - self.start-=int(margin) - self.end+=int(margin) - - -def get_build_server(build: Union[int, str]) -> str: - B38_SERVER = "https://rest.ensembl.org" - B37_SERVER = "http://grch37.rest.ensembl.org" - mapper = { - 'b38': B38_SERVER, - '38': B38_SERVER, - 38: B38_SERVER, - 'b37': B37_SERVER, - '37': B37_SERVER, - 37: B37_SERVER - } - return mapper[build] - - -def get_csq_novel_variants(e, chrcol, pscol, a1col, a2col, server): - copied_e = e.copy() - copied_e.loc[(copied_e['ensembl_rs']=="novel") & (copied_e[a1col]==copied_e[a2col]),'ensembl_consequence']='double allele' - novelsnps=copied_e.loc[(copied_e['ensembl_rs']=="novel") & (copied_e['ld']>0.1) & (copied_e['ensembl_consequence']!='double allele'),] - if novelsnps.empty: - return copied_e - novelsnps['query']=novelsnps[chrcol].astype(str)+" "+novelsnps[pscol].astype(int).astype(str)+" . "+novelsnps[a1col].astype(str)+" "+novelsnps[a2col].astype(str)+" . . ." - request='{ "variants" : ["'+'", "'.join(novelsnps['query'])+'" ] }' - ext = "/vep/homo_sapiens/region" - headers={ "Content-Type" : "application/json", "Accept" : "application/json"} - info("\t\t\tšŸŒ Querying Ensembl VEP (POST) :"+server+ext) - r = requests.post(server+ext, headers=headers, data=request) - - if not r.ok: - print("headers :"+request) - r.raise_for_status() - sys.exit() - - jData = json.loads(r.text) - csq=pd.DataFrame(jData) - - - for index,row in csq.iterrows(): - copied_e.loc[copied_e['ps']==row['start'],'ensembl_consequence']=row['most_severe_consequence'] - - copied_e['ensembl_consequence'].replace('_', ' ') - return copied_e - - -def get_coordinates(gene_name, server): - ''' - Function to return the genomic coordinates of a gene submitted (GRCh37) - output data: chromosome, start, end, stable ID - ''' - - ext = "/lookup/symbol/homo_sapiens/%s?expand=0" % (gene_name) - - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) - - if not r.ok: - r.raise_for_status() - sys.exit() - - decoded = r.json() - gc=GeneCoordinates(int(decoded["seq_region_name"]), int(decoded["start"]), int(decoded["end"]), decoded["id"], gene_name) - return(gc) - - -def get_rsid_in_region(chrom, start, end, server): - url = server+'/overlap/region/human/'+str(chrom)+":"+str(start)+"-"+str(end)+'?feature=variation;content-type=application/json;' - info("\t\t\tšŸŒ Querying Ensembl with region "+url) - response = urllib.request.urlopen(url).read().decode('utf-8') - jData = json.loads(response) - snps = pd.DataFrame(jData) - snps['location'] = snps.seq_region_name.map(str)+":"+snps.start.map(str)+"-"+snps.end.map(str) - - url = server+'/phenotype/region/homo_sapiens/'+str(chrom)+":"+str(start)+"-"+str(end)+'?feature_type=Variation;content-type=application/json;' - info("\t\t\tšŸŒ Querying Ensembl with region "+url) - response = urllib.request.urlopen(url).read().decode('utf-8') - jData = json.loads(response) - pheno=pd.DataFrame(jData) - #print(pheno['phenotype_associations']) - pheno['pheno']="" - pheno['location']="" - for index, variant in pheno.iterrows(): - for assoc in variant.phenotype_associations: - if assoc['source'] != 'COSMIC': - try: - variant.pheno=";".join(set(variant.pheno.split(";").append(assoc['description']))) - except TypeError: - variant.pheno=assoc['description'] - - 
#variant.pheno=assoc['description'] if (variant.pheno=="") else variant.pheno+";"+assoc['description']; - - variant.location=assoc['location'] - resp=pd.merge(snps, pheno, on='location', how='outer') - if pheno.empty==True: - resp.drop(["alleles", "assembly_name", "clinical_significance", "feature_type", "end", "seq_region_name", "strand", "source", "location"], axis=1, inplace=True) - resp.rename(columns = {'start':'ps', 'id':'rs', 'consequence_type':'consequence'}, inplace = True) - else: - resp.drop(["alleles", "assembly_name", "clinical_significance", "feature_type", "end", "seq_region_name", "phenotype_associations", "strand", "source", "id_y", "location"], axis=1, inplace=True) - resp.rename(columns = {'start':'ps', 'id_x':'rs', 'consequence_type':'consequence'}, inplace = True) - resp.dropna(inplace=True, subset=["rs"]) - return(resp) - - -def fetch_single_point(gc, sp_results): - c=gc.chrom - start=gc.start - end=gc.end - sp = pd.DataFrame() - task = subprocess.Popen(["tabix", "-h", sp_results, str(c)+":"+str(start)+"-"+str(end)], stdout=subprocess.PIPE) - sp=pd.read_table(task.stdout) - task = subprocess.Popen(["zgrep", "-m1", "chr", sp_results], stdout=subprocess.PIPE) - sp.columns=task.stdout.read().decode('UTF-8').split() - return(sp) - - -def read_variants_from_gene_set(gc, input_monster): - c=gc.chrom - variants=pd.read_table(input_monster) - variants.columns=[w.replace('chr'+str(c)+"_", '') for w in variants.columns] - variants.columns=[re.sub("_.*", '', w) for w in variants.columns] - variants.drop(variants.columns[[0,1]], axis=1, inplace=True) - variants=variants.transpose() - variants['ps']=variants.index - variants.index=range(variants.count()[0]) - if (len(variants.columns)==2): - variants.columns=["weight", "ps"] - else: - #Sometimes runs have no weights - variants.columns=["ps"] - variants['weight']=1 - variants.ps=pd.to_numeric(variants.ps, errors='coerce') - return(variants) diff --git a/peakplotter/interactive_manh.py b/peakplotter/interactive_manh.py index 9b8b519..9affd98 100755 --- a/peakplotter/interactive_manh.py +++ b/peakplotter/interactive_manh.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import sys -import logging import numpy as np import pandas as pd @@ -12,11 +11,9 @@ from bokeh.plotting import figure, save from bokeh.palettes import Spectral10 -from peakplotter import helper # TODO: Change this back to relative import after we replace plotpeaks.sh to a python equivalent. -from peakplotter import _interactive_manh +from . 
import _interactive_manh -def interactive_manh(file, pvalcol, pscol, rscol, mafcol, chrcol, a1col, a2col, build: str): - logging.basicConfig() +def interactive_manh(file, pvalcol, pscol, mafcol, chrcol, a1col, a2col, build: str, logger): outfile=file+".html" pd.set_option('display.max_columns', 500) @@ -78,10 +75,10 @@ def interactive_manh(file, pvalcol, pscol, rscol, mafcol, chrcol, a1col, a2col, start = int(e[pscol].min()) end = int(e[pscol].max()) ## We get the list of rsids and phenotype associations in the region - server = helper.get_build_server(build) - print(f"[DEBUG] get_rsid_in_region({chrom}, {start}, {end}, {server}") - ff = _interactive_manh.get_rsid_in_region(chrom, start, end, server) - print(ff.head()) + server = _interactive_manh.get_build_server(build) + logger.debug(f"get_rsid_in_region({chrom}, {start}, {end}, {server}, logger)") + ff = _interactive_manh.get_rsid_in_region(chrom, start, end, server, logger) + logger.debug(ff.head()) columns = ['ensembl_rs', 'ps', 'ensembl_consequence', 'ensembl_assoc'] ff.columns = columns @@ -97,7 +94,7 @@ def interactive_manh(file, pvalcol, pscol, rscol, mafcol, chrcol, a1col, a2col, dedup_ff[columns], grouped_dup_ff[columns] ]).sort_values('ps').reset_index(drop = True) - print(e.head()) + logger.debug(e.head()) # Merge dataframes, drop signals that are not present in the dataset emax=pd.merge(e, grouped_ff, on='ps', how='outer') @@ -116,13 +113,13 @@ def interactive_manh(file, pvalcol, pscol, rscol, mafcol, chrcol, a1col, a2col, e['col_assoc']=e['col_assoc'].astype(int) # ENSEMBL consequences for variants in LD that do not have rs-ids - print(f"[DEBUG] e=helper.get_csq_novel_variants(e, '{chrcol}', '{pscol}', '{a1col}', '{a2col}', '{server}')") - e=helper.get_csq_novel_variants(e, chrcol, pscol, a1col, a2col, server) + logger.debug(f"e=_interactive_manh.get_csq_novel_variants(e, '{chrcol}', '{pscol}', '{a1col}', '{a2col}', '{server}', logger)") + e = _interactive_manh.get_csq_novel_variants(e, chrcol, pscol, a1col, a2col, server, logger) # Below gets the genes > d - d = _interactive_manh.get_overlap_genes(chrom, start, end, server) + d = _interactive_manh.get_overlap_genes(chrom, start, end, server, logger) e['gene']="" for index, row in d.iterrows(): external_name = row['external_name'] @@ -176,12 +173,12 @@ def interactive_manh(file, pvalcol, pscol, rscol, mafcol, chrcol, a1col, a2col, p2.add_layout(labels) p2.xaxis.visible = False else: - helper.info("\t\t\tšŸŒ No genes overlap this genomic region.") + logger.info("\t\t\tšŸŒ No genes overlap this genomic region.") # TODO: Move all the centromere code down here. 
if (len(cen_overlap)>0): # Add indication of the centromeric region in the plot perc_overlap=int((len(cen_overlap)/len(region))*100) - helper.info("\t\t\t {0}% of the genomic region overlaps a centromere".format(perc_overlap)) + logger.info("\t\t\t {0}% of the genomic region overlaps a centromere".format(perc_overlap)) xs=min(cen_overlap) xe=max(cen_overlap)+1 diff --git a/peakplotter/logging.py b/peakplotter/logging.py new file mode 100644 index 0000000..373cb8f --- /dev/null +++ b/peakplotter/logging.py @@ -0,0 +1,16 @@ +import logging + +def make_logger(file, level = logging.INFO): + fmt = logging.Formatter('[%(levelname)s] %(message)s') + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(fmt) + + fmt = logging.Formatter('[%(asctime)s %(levelname)s] %(message)s', datefmt = '%Y-%m-%d %H:%M:%S') + file_handler = logging.FileHandler(file) + file_handler.setFormatter(fmt) + + logger = logging.getLogger('main') + logger.addHandler(stream_handler) + logger.addHandler(file_handler) + logger.setLevel(level) + return logger \ No newline at end of file diff --git a/peakplotter/main.py b/peakplotter/main.py index 8f34f21..b573cb6 100755 --- a/peakplotter/main.py +++ b/peakplotter/main.py @@ -1,16 +1,18 @@ #!/usr/bin/env python3 import sys import shutil +import logging from pathlib import Path -from datetime import datetime import click from . import __version__ from ._data import get_data_path from .utils import check_executable, DEPENDENT_EXECUTABLES +from .tools import Plink from .errors import MissingExecutableError -from .plotpeaks import main +from .plotpeaks import main, process_peak +from .logging import make_logger @click.command() @@ -27,9 +29,10 @@ @click.option('-b', '--build', type = click.INT, default = 38, show_default=True, help = "Assembly build (37 or 38)") @click.option('-s', '--signif', type=click.FLOAT, default=5e-8, help = 'The significance level above which to declare a variant significant. Scientific notation (such as 5e-8) is fine.') @click.option('-bp', '--flank-bp', type = click.INT, default = 500_000, help = 'Flanking size in base pairs for drawing plots (defaults to 500kb, i.e. 
1Mbp plots) around lead SNPs.') +@click.option('--debug', is_flag=True, flag_value = True, default = False, help = 'Set the log level from INFO to DEBUG.') @click.option('--overwrite', is_flag=True, flag_value = True, default = False, help = 'Overwrite output directory if it already exists.') @click.option('--version', is_flag=True, flag_value = True, default = False, help = 'Output version number of PeakPlotter') -def cli(assoc_file, bfiles, outdir, chr_col, pos_col, rs_col, pval_col, a1_col, a2_col, maf_col, build, signif, flank_bp, overwrite, version): +def cli(assoc_file, bfiles, outdir, chr_col, pos_col, rs_col, pval_col, a1_col, a2_col, maf_col, build, signif, flank_bp, debug, overwrite, version): '''PeakPlotter ''' if version is True: @@ -51,11 +54,7 @@ def cli(assoc_file, bfiles, outdir, chr_col, pos_col, rs_col, pval_col, a1_col, if missing_executables: raise MissingExecutableError(f"Executables missing: {', '.join(missing_executables)}") - - # TODO: Handle situation where only one file is given - if (ref_flat is None) and (recomb is None): - raise FileNotFoundError('Need to give ref_flat and recomb option') - # ref_flat, recomb = _get_locuszoom_data_path() + outdir = Path(outdir) if outdir.exists() and overwrite is False: @@ -67,7 +66,8 @@ def cli(assoc_file, bfiles, outdir, chr_col, pos_col, rs_col, pval_col, a1_col, outdir.mkdir() else: outdir.mkdir() - # Save run configurations in the output directory + + # Save run configurations in the output log file configs = { 'assoc_file': assoc_file, 'bfiles': bfiles, @@ -83,29 +83,144 @@ def cli(assoc_file, bfiles, outdir, chr_col, pos_col, rs_col, pval_col, a1_col, 'signif': signif, 'flank_bp': flank_bp } - now = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S') - with open(outdir.joinpath(f'{outdir.name}.config.yaml'), 'x') as f: - f.write(f'peakplotter: {__version__}\n') - f.write(f'started: {now}\n') - for key, val in configs.items(): - f.write(f'{key}: {val}\n') - - main(signif, - assoc_file, - chr_col, - pos_col, - rs_col, - pval_col, - a1_col, - a2_col, - maf_col, - bfiles, - flank_bp, - ref_flat, - recomb, - build, - outdir, - memory = 30000) + log_level = logging.DEBUG if debug else logging.INFO + logger = make_logger(outdir.joinpath(f'{outdir.name}.log'), level = log_level) + logger.info(f'PeakPlotter Version: {__version__}') + arg_string = '\nArguments: \n' + for k, v in configs.items(): + arg_string += f' {k}: {v}\n' + logger.info(arg_string) + + try: + main(signif, + assoc_file, + chr_col, + pos_col, + rs_col, + pval_col, + a1_col, + a2_col, + maf_col, + bfiles, + flank_bp, + ref_flat, + recomb, + build, + outdir, + logger, + memory = 30000) + except Exception: + logger.critical('Unexpected error occurred:', exc_info = True) + raise + + + + + +@click.command() +@click.option('-a', '--assoc-file', type = click.Path(exists=True, dir_okay=False), required=True, help = 'Path to the association file. It can be gzipped, provided that it bears the .gz extension. Its first line must be a header, coherent with the name arguments below. It must be tab-separated, bgzipped and tabixed (tabix is available as part of bcftools)') +@click.option('-f', '--bfiles',type = click.STRING, required=True, help = 'Binary PLINK (.bed/.bim/.fam) file base name. This should contain the genotypes for at least all the variants in the assoc_file, but it can contain more. 
Please note that this is the base name, without the .bed/.bim/.fam extension.') +@click.option('-o', '--out', 'outdir', type = click.Path(file_okay=False, writable=True), required=True, help = 'Output directory to store all output files.') +@click.option('-chr', '--chr-col', type = click.STRING, required=True, help = 'Name of the column for chromosome names.') +@click.option('-ps', '--pos-col', type = click.STRING, required=True, help = 'Name of the column for chromosomal position.') +@click.option('-rs', '--rs-col', type = click.STRING, required=True, help = 'Name of the column for unique SNP ids (RS-id or chr:pos).') +@click.option('-p', '--pval-col', type = click.STRING, required=True, help = 'Name of the column for p-values.') +@click.option('-a1', '--a1-col', type = click.STRING, required=True, help = 'Name of the column for reference or major allele (used for predicting consequence).') +@click.option('-a2', '--a2-col', type = click.STRING, required=True, help = 'Name of the column for alternate or minor allele.') +@click.option('-maf', '--maf-col', type = click.STRING, required=True, help = 'Name of the column for non-reference or minor allele frequency.') +@click.option('-c', '--chrom', type = click.INT, required=True, help = 'Chromosome of the peak to plot') +@click.option('-s', '--start', type = click.INT, required=True, help = "Start of the peak to plot.") +@click.option('-e', '--end', type = click.INT, required=True, help = "End of the peak to plot.") +@click.option('-b', '--build', type = click.INT, default = 38, show_default=True, help = "Assembly build (37 or 38).") +@click.option('--debug', is_flag=True, flag_value = True, default = False, help = 'Set the log level from INFO to DEBUG.') +def cli_region(assoc_file, bfiles, outdir, chr_col, pos_col, rs_col, pval_col, a1_col, a2_col, maf_col, chrom, start, end, build, debug): + + ref_flat, recomb = get_data_path(build) + if not ref_flat.exists() or not recomb.exists(): + click.echo('[ERROR] Download of some data is required. Please run peakplotter-data-setup in the commandline') + sys.exit(1) + + # TODO: Change --bfile option type later to click.Path. + # We do STRING for now, because the plotpeaks.sh script accepts comma-separated filepaths. 
+ missing_executables = list() + for exe in DEPENDENT_EXECUTABLES: + if not check_executable(exe): + missing_executables.append(exe) + + if missing_executables: + raise MissingExecutableError(f"Executables missing: {', '.join(missing_executables)}") + + # TODO: Handle situation where only one file is given + if (ref_flat is None) and (recomb is None): + raise FileNotFoundError('Need to give ref_flat and recomb option') + # ref_flat, recomb = _get_locuszoom_data_path() + + outdir = Path(outdir) + if not outdir.exists(): + outdir.mkdir() + # Save run configurations in the output directory + configs = { + 'run_mode': 'manual', + 'assoc_file': assoc_file, + 'bfiles': bfiles, + 'outdir': outdir, + 'chr_col': chr_col, + 'pos_col': pos_col, + 'rs_col': rs_col, + 'pval_col': pval_col, + 'a1_col': a1_col, + 'a2_col': a2_col, + 'maf_col': maf_col, + 'chrom': chrom, + 'start': start, + 'end': end, + 'build': build, + } + + log_level = logging.DEBUG if debug else logging.INFO + logger = make_logger(outdir.joinpath(f'{outdir.name}.log'), level = log_level) + logger.info(f'PeakPlotter Version: {__version__}') + arg_string = '\nArguments: \n' + for k, v in configs.items(): + arg_string += f' {k}: {v}\n' + logger.info(arg_string) + + + flank_bp = end - start + flank_kb = flank_bp // 1000 + ext_flank_kb = flank_kb + 100 + + bfiles_list = bfiles.split(',') + plink = Plink(30_000) + + try: + process_peak(assoc_file, + chr_col, + pos_col, + pval_col, + maf_col, + rs_col, + a1_col, + a2_col, + chrom, + start, + end, + 1, + outdir, + ref_flat, + recomb, + bfiles_list, + plink, + build, + ext_flank_kb, + logger) + except Exception: + logger.critical('Unexpected error occurred:', exc_info = True) + raise + + + + if __name__ == '__main__': cli() diff --git a/peakplotter/plotpeaks.py b/peakplotter/plotpeaks.py index 277c851..f40a444 100644 --- a/peakplotter/plotpeaks.py +++ b/peakplotter/plotpeaks.py @@ -12,7 +12,7 @@ from .peakit import peakit from .interactive_manh import interactive_manh -def read_assoc(filepath, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col, chunksize = 10000) -> pd.DataFrame: +def read_assoc(filepath, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col, logger, chunksize = 10000) -> pd.DataFrame: """ Lazily load and filter the association file. 
@@ -35,21 +35,21 @@ def read_assoc(filepath, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2 chunk[pval_col] = pd.to_numeric(chunk[pval_col], errors = 'coerce') nan_list = list(pd.isna(chunk[pval_col])) if nan_list.count(True): - print(f"[WARNING] Removing {nan_list.count(True)} rows with invalid p-value") + logger.debug(f"Removing {nan_list.count(True)} rows with invalid p-value") chunk = chunk.dropna().reset_index(drop = True) a1_check = chunk[a1_col].str.contains('[^ATGC]') if any(a1_check): invalid_strings = a1_check.to_list() - print(f"[WARNING] Removing {invalid_strings.count(True)} rows with invalid a1 string value") - print(chunk.loc[a1_check, [chr_col, rs_col, pos_col, a1_col, a2_col, maf_col, pval_col]]) + logger.debug(f"Removing {invalid_strings.count(True)} rows with invalid a1 string value") + logger.debug(chunk.loc[a1_check, [chr_col, rs_col, pos_col, a1_col, a2_col, maf_col, pval_col]]) chunk = chunk[~a1_check].reset_index(drop = True) a2_check = chunk[a2_col].str.contains('[^ATGC]') if any(a2_check): invalid_strings = a2_check.to_list() - print(f"[WARNING] Removing {invalid_strings.count(True)} rows with invalid a2 string value") - print(chunk.loc[a2_check, [chr_col, rs_col, pos_col, a1_col, a2_col, maf_col, pval_col]]) + logger.debug(f"Removing {invalid_strings.count(True)} rows with invalid a2 string value") + logger.debug(chunk.loc[a2_check, [chr_col, rs_col, pos_col, a1_col, a2_col, maf_col, pval_col]]) chunk = chunk[~a2_check].reset_index(drop = True) yield chunk @@ -132,23 +132,23 @@ def process_peak(assocfile: str, start: int, end: int, current: int, - total_peak_count: int, outdir: Path, refflat: Path, recomb: Path, bfiles_list: List[str], plink: Plink, build: int, - ext_flank_kb: int): - print(f"Treating peak {chrom} {start} {end} (peak {current+1} / {total_peak_count} )") - - assoc = read_assoc(assocfile, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col) + ext_flank_kb: int, + logger): + assoc = read_assoc(assocfile, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col, logger) + logger.info('Looking for signals..') concat_list = list() for chunk in assoc: filtered_chunk = chunk.loc[(chunk[chr_col]==chrom) & (start < chunk[pos_col]) & (chunk[pos_col] < end)] if filtered_chunk.shape[0] > 0: concat_list.append(filtered_chunk) + logger.info('Generating peakdata') peakdata = pd.concat(concat_list).sort_values([chr_col, pos_col]).reset_index(drop = True) peakdata[rs_col] = _create_non_rs_to_pos_id(peakdata, chr_col, rs_col, pos_col) # '1:100' -> 'chr1:100' @@ -160,32 +160,34 @@ def process_peak(assocfile: str, peakdata_chrpos_path = outdir.joinpath('peakdata.chrpos') db_file = outdir.joinpath(f'{chrom}.{start}.db') - + logger.debug(f'Generating peakdata.chrpos to {peakdata_chrpos_path}') peakdata_chrpos.to_csv(peakdata_chrpos_path, sep = '\t', index = False) - sp.check_output(shlex.split(f"dbmeister.py --db {db_file} --snp_pos {peakdata_chrpos_path}")) - sp.check_output(shlex.split(f"dbmeister.py --db {db_file} --refflat {refflat}")) - sp.check_output(shlex.split(f"dbmeister.py --db {db_file} --recomb_rate {recomb}")) + logger.info('Running dbmeister.py') + logger.debug(sp.check_output(shlex.split(f"dbmeister.py --db {db_file} --snp_pos {peakdata_chrpos_path}"))) + logger.debug(sp.check_output(shlex.split(f"dbmeister.py --db {db_file} --refflat {refflat}"))) + logger.debug(sp.check_output(shlex.split(f"dbmeister.py --db {db_file} --recomb_rate {recomb}"))) if start < 1: sensible_start = 1 - print(f'[nWARNING]\t Negative start position changed 
to {sensible_start} : {chrom} {start} (1)') + logger.warning(f'Negative start position changed to {sensible_start} : {chrom} {start} (1)') else: sensible_start = start - + logger.info(f'Extract {chrom}:{start}-{end} genotypes') mergelist = list() for count, bfile in enumerate(bfiles_list): out = outdir.joinpath(f'peak.{chrom}.{start}.{end}.{count}') - print(f"[DEBUG] plink.extract_genotypes('{bfile}', {chrom}, {start}, {end}, '{out}')") + logger.debug(f"plink.extract_genotypes('{bfile}', {chrom}, {start}, {end}, '{out}')") ps = plink.extract_genotypes(bfile, chrom, start, end, out) - print(ps.stdout.decode()) - print(ps.stderr.decode()) + logger.debug(ps.stdout.decode()) + logger.debug(ps.stderr.decode()) if ps.returncode==12: # TODO: Add some kind of test to ensure this part # A cohort can have no variants within a peak region # For example, peak was detected in chr1:200-300 driven by a variant # in cohortA, but cohortB has no variants within that region. + logger.warning('Error Code 12 for plink.extract_genotypes') continue ## Modify BIM file bimfile = f'{out}.bim' @@ -201,18 +203,19 @@ def process_peak(assocfile: str, ## Merge plink binaries if multiple present assert len(mergelist) >= 1, f'mergelist length is {len(mergelist)}' - print(f"[DEBUG] {mergelist}") - print(f"[INFO] Merging {mergelist}") + logger.debug(f"{mergelist}") + logger.info(f"Merging {mergelist}") mergelist_file = str(outdir.joinpath('mergelist')) out_merge = str(outdir.joinpath(f'peak.{current+1}')) - print(f"[DEBUG] plink.merge_region({mergelist_file}, '{mergelist}', {chrom}, {start}, {end}, '{out_merge}')") + logger.debug(f"plink.merge_region({mergelist_file}, '{mergelist}', {chrom}, {start}, {end}, '{out_merge}')") ps = plink.merge_region(mergelist_file, mergelist, chrom, start, end, out_merge) - print(ps.stdout.decode()) - print(ps.stderr.decode()) + logger.debug(ps.stdout.decode()) + logger.debug(ps.stderr.decode()) if ps.returncode == 3: + logger.info('Multiallelic variants detected. 
Excluding') # Exclude variants which failed merge missnp_file = f'{out_merge}-merge.missnp' missnp_list = pd.read_csv(missnp_file, header = None)[0].to_list() @@ -229,10 +232,10 @@ def process_peak(assocfile: str, Path(f'{bfile}.bed').unlink() Path(f'{bfile}.bim').unlink() Path(f'{bfile}.fam').unlink() - print("[DEBUG] plink.merge('{mergelist_file}', '{out_merge}')") + logger.debug(f"plink.merge('{mergelist_file}', '{out_merge}')") ps = plink.merge(mergelist_file, out_merge) - print(ps.stdout.decode()) - print(ps.stderr.decode()) + logger.debug(ps.stdout.decode()) + logger.debug(ps.stderr.decode()) index_of_var_with_lowest_pval = peakdata[pval_col].idxmin() @@ -246,11 +249,12 @@ def process_peak(assocfile: str, # pos = peakdata.loc[index_of_var_with_lowest_pval, pos_col] # refsnp = f'chr{chrom}:{pos}' - print(f"\n\nIn region {chrom} {start} {end}, top SNP is {refsnp}\n\n") + logger.info(f"\n\nIn region {chrom} {start} {end}, top SNP is {refsnp}\n\n") + logger.debug(f'plink.ld("{out_merge}", "{refsnp}", "{ext_flank_kb}", "{out_merge}")') ps = plink.ld(out_merge, refsnp, ext_flank_kb, out_merge) - print(ps.stdout.decode()) - print(ps.stderr.decode()) + logger.debug(ps.stdout.decode()) + logger.debug(ps.stderr.decode()) ld_data = pd.read_csv(f'{out_merge}.ld', delim_whitespace=True) ld_data = ld_data[['SNP_A', 'SNP_B', 'R2', 'R2']] ld_data.columns = ['snp1', 'snp2', 'dprime', 'rsquare'] @@ -267,18 +271,20 @@ def process_peak(assocfile: str, joined_peakdata_ld_file = outdir.joinpath(f'{chrom}.{start}.{end}.500kb') joined_peakdata_ld.to_csv(joined_peakdata_ld_file, sep = ',', header = True, index = False) - + logger.debug(f'run_locuszoom("{build}", "{peakdata_file}", "{refsnp}", "{rs_col}", "{pval_col}", "{db_file}", "{joined_peakdata_ld_file}", "{ld_file}", "{sensible_start}", "{end}", "{chrom}")') ps = run_locuszoom(build, peakdata_file, refsnp, rs_col, pval_col, db_file, joined_peakdata_ld_file, ld_file, sensible_start, end, chrom) - print(ps.stdout.decode()) - print(ps.stderr.decode()) - if build==37: - print(f"[DEBUG] interactive_manh({str(joined_peakdata_ld_file)}, {pval_col}, {pos_col}, {rs_col}, {maf_col}, {chr_col}, {a1_col}, {a2_col}, build = 'b37')") - interactive_manh(str(joined_peakdata_ld_file), pval_col, pos_col, rs_col, maf_col, chr_col, a1_col, a2_col, build = 'b37') - elif build==38: - print(f"[DEBUG] interactive_manh({str(joined_peakdata_ld_file)}, {pval_col}, {pos_col}, {rs_col}, {maf_col}, {chr_col}, {a1_col}, {a2_col}, build = 'b38')") - interactive_manh(str(joined_peakdata_ld_file), pval_col, pos_col, rs_col, maf_col, chr_col, a1_col, a2_col, build = 'b38') - print(f"Done with peak {chrom} {start} {end}.") - print("Cleaning plink binary files") + logger.debug(ps.stdout.decode()) + logger.debug(ps.stderr.decode()) + if build == 38: + b = 'b38' + elif build == 37: + b = 'b37' + + logger.debug(f"interactive_manh({str(joined_peakdata_ld_file)}, {pval_col}, {pos_col}, {maf_col}, {chr_col}, {a1_col}, {a2_col}, build = {b})") + interactive_manh(str(joined_peakdata_ld_file), pval_col, pos_col, maf_col, chr_col, a1_col, a2_col, build = b, logger = logger) + + logger.info(f"Done with peak {chrom} {start} {end}.") + logger.info("Cleaning plink binary files") to_delete = list(outdir.glob(f'peak.{chrom}.{start}.{end}.*.*')) for file in to_delete: file.unlink() @@ -290,28 +296,31 @@ def _make_done(outdir: Path): -def main(signif, 
assocfile, chr_col, pos_col, rs_col, pval_col, a1_col, a2_col, maf_col, bfiles, flank_bp, refflat, recomb, build, outdir, logger, memory = 30000): # ext_flank_bp = flank_bp + 100_000 flank_kb = flank_bp // 1000 ext_flank_kb = flank_kb + 100 bfiles_list = bfiles.split(',') plink = Plink(memory) - assoc = read_assoc(assocfile, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col) + assoc = read_assoc(assocfile, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col, logger) + logger.debug('Getting signals') signals = get_signals(assoc, signif, chr_col, pos_col, pval_col) if signals.empty: - print("No peaks found. Exiting.") + logger.info("No peaks found. Exiting.") _make_done(outdir) sys.exit(0) + logger.debug('Making peak_collections') peak_collections = peakit(signals, pval_col, chr_col, pos_col, flank_bp) peak_collections.merge() peaked = peak_collections.data - + logger.debug(peaked) peaked_file = outdir.joinpath('peaked') peaked.to_csv(peaked_file, sep = '\t', header = True, index = False) total_peak_count = peaked.shape[0] for current, (chrom, start, end) in peaked.iterrows(): + logger.info(f"Treating peak {chrom} {start} {end} (peak {current+1} / {total_peak_count} )") process_peak(assocfile, chr_col, pos_col, @@ -324,13 +333,13 @@ def main(signif, assocfile, chr_col, pos_col, rs_col, pval_col, a1_col, a2_col, start, end, current, - total_peak_count, outdir, refflat, recomb, bfiles_list, plink, build, - ext_flank_kb) + ext_flank_kb, + logger) _make_done(outdir) - print('Finished..') + logger.info('Finished') diff --git a/peakplotter/test_utils.py b/peakplotter/test_utils.py new file mode 100644 index 0000000..2eb5b36 --- /dev/null +++ b/peakplotter/test_utils.py @@ -0,0 +1,5 @@ +import logging + + +def get_test_logger(): + return logging.getLogger('test') \ No newline at end of file diff --git a/peakplotter/utils.py b/peakplotter/utils.py index 2abf703..638f29b 100755 --- a/peakplotter/utils.py +++ b/peakplotter/utils.py @@ -11,7 +11,6 @@ 'locuszoom', # Locuszoom 'dbmeister.py', # Locuszoom 'plink', # Plink - 'sponge' # Moreutils ] diff --git a/setup.py b/setup.py index a10dc6d..35ca2d8 100755 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ entry_points = { 'console_scripts': [ 'peakplotter = peakplotter.main:cli', + 'peakplotter-region = peakplotter.main:cli_region', 'peakplotter-data-setup = peakplotter._data:setup_data', ], }, diff --git a/test/test__interactive_manh.py b/test/test__interactive_manh.py index 5c251f4..bbd7c35 100755 --- a/test/test__interactive_manh.py +++ b/test/test__interactive_manh.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import testing -from peakplotter import helper -from peakplotter._interactive_manh import make_resp, get_centromere_region, query_vep, _get_csq_novel_variants +from peakplotter._interactive_manh import make_resp, get_centromere_region, query_vep, _get_csq_novel_variants, get_build_server +from peakplotter.test_utils import get_test_logger def test_make_resp_when_pheno_df_is_empty(): @@ -82,14 +82,15 @@ def test_query_vep(): pos = pd.Series([26960070, 26965148], dtype = np.int64) a1 = pd.Series(['G', 'G'], dtype = str) a2 = pd.Series(['A', 'A'], dtype = str) - server = helper.get_build_server(38) - output = query_vep(chrom, pos, a1, a2, server) + server = get_build_server(38) + logger = get_test_logger() + output = query_vep(chrom, pos, a1, a2, server, logger) assert len(output)==2 def test__get_csq_novel_variants(): chrcol, pscol, a1col, a2col = "chrom", "ps", "a1", "a2" - server = helper.get_build_server(38) + server = 
get_build_server(38) e = pd.DataFrame([ [21, 26960070, 'G', 'A', 'novel', 0.5, ''], [21, 26965148, 'G', 'A', 'novel', 0.5, ''] @@ -99,6 +100,8 @@ def test__get_csq_novel_variants(): [21, 26960070, 'G', 'A', 'novel', 0.5, 'intron_variant'], [21, 26965148, 'G', 'A', 'novel', 0.5, 'intron_variant'] ], columns = [chrcol, pscol, a1col, a2col, 'ensembl_rs', 'ld', 'ensembl_consequence']) + logger = get_test_logger() + output = _get_csq_novel_variants(e, chrcol, pscol, a1col, a2col, server, logger) - output = _get_csq_novel_variants(e, chrcol, pscol, a1col, a2col, server) + testing.assert_frame_equal(expected, output) diff --git a/test/test_plotpeaks.py b/test/test_plotpeaks.py index a5c1d16..05cc043 100644 --- a/test/test_plotpeaks.py +++ b/test/test_plotpeaks.py @@ -3,7 +3,7 @@ import pandas as pd from peakplotter import plotpeaks - +from peakplotter.test_utils import get_test_logger def test_read_assoc_with_METAL_output(): """ @@ -24,7 +24,8 @@ def test_read_assoc_with_METAL_output(): example_buff = io.StringIO() example.to_csv(example_buff, sep = '\t', header = True, index = False) example_buff.seek(0) - iterable = plotpeaks.read_assoc(example_buff, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col) + logger = get_test_logger() + iterable = plotpeaks.read_assoc(example_buff, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col, logger) data = next(iterable) expected = pd.DataFrame([ @@ -49,7 +50,8 @@ def test_read_assoc_with_invalid_allele_string(): example_buff = io.StringIO() example.to_csv(example_buff, sep = '\t', header = True, index = False) example_buff.seek(0) - iterable = plotpeaks.read_assoc(example_buff, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col) + logger = get_test_logger() + iterable = plotpeaks.read_assoc(example_buff, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col, logger) data = next(iterable) expected = pd.DataFrame([ @@ -77,7 +79,8 @@ def test_get_signals_when_no_signif_signals(): example_buff = io.StringIO() example.to_csv(example_buff, sep = '\t', header = True, index = False) example_buff.seek(0) - iterable = plotpeaks.read_assoc(example_buff, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col) + logger = get_test_logger() + iterable = plotpeaks.read_assoc(example_buff, chr_col, pos_col, pval_col, maf_col, rs_col, a1_col, a2_col, logger) signals = plotpeaks.get_signals(iterable, 5e-8, chr_col, pos_col, pval_col)
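
For reference, a minimal usage sketch (not part of the patch above) of the new logger plumbing: make_logger() from the new peakplotter/logging.py builds a logger that writes to both the console and a per-run log file, and the refactored functions (read_assoc, process_peak, interactive_manh, get_rsid_in_region, ...) now receive that logger explicitly instead of calling print() or the removed helper.info(). The output directory and column names below are hypothetical.

import logging
from pathlib import Path

from peakplotter.logging import make_logger
from peakplotter import plotpeaks

# Hypothetical output directory; the CLI uses the directory given via --out.
outdir = Path('example_out')
outdir.mkdir(exist_ok=True)
logger = make_logger(outdir / f'{outdir.name}.log', level=logging.DEBUG)

# read_assoc() is a generator; its row-filtering messages now go through the logger.
chunks = plotpeaks.read_assoc(
    'assoc.tsv.gz',                                              # hypothetical association file
    'chr', 'ps', 'p_value', 'maf', 'rs', 'allele1', 'allele2',   # hypothetical column names
    logger,
)
for chunk in chunks:
    logger.debug(f'Loaded chunk with {chunk.shape[0]} rows')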