diff --git a/CHANGES.md b/CHANGES.md index dffdf01..e7e5f82 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,5 @@ +## 1.1.4 +* updated regex to get genename ## 1.1.3 * fixed bug in gene name comparison with hyphen(-) * corrected typo in default directory name diff --git a/Dockerfile b/Dockerfile index cda4cdc..c3dc72b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ USER root MAINTAINER cgphelp@sanger.ac.uk -ENV ANNOTATEVCF_VER '1.1.3' +ENV ANNOTATEVCF_VER '1.1.4' # install system tools RUN apt-get -yq update diff --git a/annotate/commandline.py b/annotate/commandline.py index 28f9ef7..2f38e71 100644 --- a/annotate/commandline.py +++ b/annotate/commandline.py @@ -17,7 +17,7 @@ def main(): - usage = "\n %prog [options] -vcf input.vcf [-drv_json test.json -drv_data test_dir] " + usage = "\n %prog [options] -vcf input.vcf [-filter -np -gt -g -m -lof -hl -o ]" optParser = argparse.ArgumentParser(prog='annotateVcf', formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -27,7 +27,7 @@ def main(): required.add_argument("-vcf", "--vcf_file", type=str, dest="vcf_file", required=True, default=None, help="vcf_file to annotate") - required.add_argument("-filter", "--vcf_filter", type=str, dest="vcf_filter", nargs='+', + optional.add_argument("-filter", "--vcf_filter", type=str, dest="vcf_filter", nargs='+', required=False, default=['PASS'], help="Include variant sites \ matching vcf FILTER flag(s), multiple flags can be specified \ with space separator") @@ -36,7 +36,8 @@ def main(): default=None, help="normal panel file to flag germline variant sites") optional.add_argument("-gt", "--germline_tag", type=str, dest="germline_tag", required=False, - default="NPGL", help="tag to mark normal panel filtered variants in vcf INFO field") + default="NPGL", help="tag to mark normal panel filtered variants in \ + vcf INFO field, only applicable when -np is set") optional.add_argument("-g", "--lof_genes", type=str, dest="lof_genes", required=False, default=None, help="LoF gene name file to use annotations") diff --git a/annotate/vcf_annotator.py b/annotate/vcf_annotator.py index 7fc6dee..6dbf58d 100644 --- a/annotate/vcf_annotator.py +++ b/annotate/vcf_annotator.py @@ -133,8 +133,8 @@ def annotate_lof_genes(self, genes_file, lof_types): :param lof_types: lof consequences type string :return: """ - get_gene = re.compile(r'.*;VD=(\w+)|.*') # create dummy genome locationo file to annoate LoF genes... + get_gene = re.compile(r'\bVD=([-\w]+)') genome_loc_file = self.outdir + '/genome.tab.gz' create_dummy_genome(self.vcf_path, genome_loc_file) genes_outfile = self.outfile_name.format('_genes.vcf') @@ -150,9 +150,10 @@ def annotate_lof_genes(self, genes_file, lof_types): if line.startswith('#'): lof_fh.write(line) else: - gene = line.split('VD=')[1].split('|')[0] + gene = get_gene.search(line)[1] + # gene = _get_gene('VD', 0)) # write matching LoF genes.... - if gene in lof_gene_list: + if gene.upper() in lof_gene_list: lof_fh.write(line) self.merge_vcf_dict['b'] = compress_vcf(lof_outfile) @@ -171,9 +172,19 @@ def concat_results(self): # generic methods .... +def _get_gene(line, gene_field, field_loc): + # Not used ... kept for future implementation of different annotation fields.... + # ANN=T|missense_variant|MODERATE|AGAP005273|AGAP005273| [ e.g. 'ANN', 3] + # VD=TP5-TEST1-TEST2|CCDS11118.1|r.276_277insa|c.86_87insA|p.N29fs*14| [ e.g. 'VD', 0 ] + info_list = line.split("\t")[7].split(';') + info_dict = dict(f.split('=') for f in info_list if '=' in f) + gene = info_dict[gene_field].split('|')[field_loc] + return gene.upper() + + def get_drv_gene_list(drv_genes): with open(drv_genes) as f_drv: - lof_gene_list = f_drv.read().splitlines() + lof_gene_list = [gene.upper() for gene in f_drv.read().splitlines()] return lof_gene_list diff --git a/setup.py b/setup.py index 5297938..986a553 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup config = { - 'version': '1.1.3', + 'version': '1.1.4', 'name': 'annotateVcf', 'description': 'Tool to annotate and filter vcf files...', 'author': 'Shriram Bhosle',