Skip to content

Commit

Permalink
Feature/vaf filter (#8)
Browse files Browse the repository at this point in the history
* added config to handle VAF cutoff and other filters

* corrected typo

* corrected typo
  • Loading branch information
sb43 authored Mar 31, 2021
1 parent 7873dae commit 31b5a3d
Show file tree
Hide file tree
Showing 64 changed files with 434 additions and 210 deletions.
18 changes: 6 additions & 12 deletions annotate/commandline.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
configdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config/')
log_config = configdir + 'logging.conf'
info_header = configdir + 'info.header'
filters_json = configdir + 'filters.json'
logging.config.fileConfig(log_config)
log = logging.getLogger(__name__)
version = pkg_resources.require("annotateVcf")[0].version
Expand All @@ -27,28 +28,21 @@ def main():
required.add_argument("-vcf", "--vcf_file", type=str, dest="vcf_file", required=True,
default=None, help="vcf_file to annotate")

optional.add_argument("-filter", "--vcf_filter", type=str, dest="vcf_filter", nargs='+',
required=False, default=['PASS'], help="Include variant sites \
matching vcf FILTER flag(s), multiple flags can be specified \
with space separator")
optional.add_argument("-filters", "--vcf_filters", type=str, dest="vcf_filters", required=False,
default=filters_json, help="Include vcf filters \
configuration file in json (param:value) format \
[please refer bcftools documentation for more details \
: http://samtools.github.io/bcftools/bcftools.html#expressions]")

optional.add_argument("-np", "--normal_panel", type=str, dest="normal_panel", required=False,
default=None, help="normal panel file to flag germline variant sites")

optional.add_argument("-gt", "--germline_tag", type=str, dest="germline_tag", required=False,
default="NPGL", help="tag to mark normal panel filtered variants in \
vcf INFO field, only applicable when -np is set")

optional.add_argument("-g", "--lof_genes", type=str, dest="lof_genes", required=False,
default=None, help="LoF gene name file to use annotations")

optional.add_argument("-m", "--mutations", type=str, dest="mutations", required=False,
default=None, help="driver mutations file to use for driver variant annotations")

optional.add_argument("-lof", "--lof_type", type=str, dest="lof_type", nargs='+', metavar='N',
required=False, default=["stop_lost", "start_lost", "ess_splice",
"frameshift", "nonsense"], help="Loss of function effect types")

optional.add_argument("-hl", "--header_line", type=str, dest="header_line",
required=False, default=info_header, help="vcf info header line and info tag")

Expand Down
17 changes: 17 additions & 0 deletions annotate/config/filters.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"_#comment": [
"Add filters to include/ exclude the variants sites to annotate using drivers",
"please refer filter expression formats to add new filters : https://samtools.github.io/bcftools/bcftools.html#expressions"
],
"include": {
"FILTER": "FILTER=\"PASS\"",
"FORMAT": "FORMAT/VAF[*] > 0.15",
"INFO": "INFO/VC=\"stop_lost,start_lost,ess_splice,frameshift,nonsense\"",
"INFO_FLAG_GERMLINE": "NPGL"
},

"exclude": {
"None": null
}
}

59 changes: 28 additions & 31 deletions annotate/io_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import io
import tempfile
import pkg_resources
import json
import shutil

from contextlib import contextmanager
Expand All @@ -24,12 +25,10 @@ def __init__(self, **kwargs):
self.genes_file = kwargs.get('lof_genes', None)
self.muts_file = kwargs.get('mutations', None)
self.np_vcf = kwargs.get('normal_panel', None)
self.np_tag = kwargs['germline_tag']
self.lof_type = kwargs['lof_type']
self.header_line = kwargs['header_line']
self.outdir = kwargs['outdir']
self.keepTmp = kwargs['keepTmp']
self.filter = kwargs.get('vcf_filter', None)
self.json_file = kwargs.get('vcf_filters', None)
# check input data ...

def format(self, input_array):
Expand All @@ -47,14 +46,12 @@ def format(self, input_array):
def _get_formatter(self, input_type):
if input_type == 'vcf_file':
return self._format_vcf
elif input_type == 'lof_type':
return self._format_lof
elif input_type == 'outdir':
return self._get_outdir_path
elif input_type == 'format_filter':
return self._format_vcf_filter
elif input_type == 'input_status':
return self._check_input
elif input_type == 'vcf_filters':
return self._get_filters
else:
raise ValueError(input_type)

Expand All @@ -72,19 +69,13 @@ def _check_input(self):
def _format_vcf(self):
return get_file_metadata(self.vcf_file)

def _format_vcf_filter(self):
def _get_filters(self):
"""
formatter function
:return:
"""
return format_filter(self.filter, 'FILTER=')

def _format_lof(self):
load parameters from json config file
"""
format lof consequence types to filter
:return:
"""
return format_filter(self.lof_type, 'INFO/VC=')
inc_filters = ['FORMAT', 'FILTER', 'INFO', 'INFO_FLAG_GERMLINE']
formatted_filters = parse_filters(self.json_file, 'include', inc_filters)
return formatted_filters

def _get_outdir_path(self):
"""
Expand Down Expand Up @@ -113,19 +104,6 @@ def check_inputs(file_dict):
return file_dict


def format_filter(filter_vals, filter_type):
"""
:param filter_vals: for vcf filter field
:param filter_type: filter prefix [e.g., FILTER, INFO, etc ]
:return: formatted filter string
"""
format_store = []
for val in filter_vals:
format_store.append(filter_type + "\"" + val + "\"")
return ' || '.join(format_store)


def get_file_metadata(full_file_name):
"""
takes file path as input and gives its path and processed extension
Expand All @@ -139,6 +117,25 @@ def get_file_metadata(full_file_name):
return file_metadata


def parse_filters(json_file, filter_type, filters):
"""
load filtering parameters from json config file
"""
filter_param_dict = {}
try:
if json_file is None:
sys.exit('Json configuration file must be provided')
with open(json_file, 'r') as cfgfile:
filter_cfg = json.load(cfgfile)
for filter in filters:
filter_param_dict[filter] = filter_cfg[filter_type][filter]
except json.JSONDecodeError as jde:
sys.exit('json error:{}'.format(jde.args[0]))
except FileNotFoundError as fne:
sys.exit('Can not find json file:{}'.format(fne.args[0]))
return filter_param_dict


@contextmanager
def tempdir(mypath):
"""
Expand Down
50 changes: 32 additions & 18 deletions annotate/vcf_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def __init__(self, f, basedir):
self.outdir = basedir
self.status_dict = f.format(['input_status'])
self.input_data = f.format(['vcf_file'])

# set input vcf parameters ...
self._set_input_vcf(self.input_data)
self.drv_header = f.header_line
Expand All @@ -42,24 +41,27 @@ def _runAnalysis(self, f):
:return:
"""
status = self.status_dict['input_status']
vcf_filters = f.format(['format_filter'])
self.vcf_filter = vcf_filters['format_filter']
vcf_filters = f.format(['vcf_filters'])
vcf_filter_params = vcf_filters.get('vcf_filters', None)
self.info_filters = vcf_filter_params['INFO']
self.format_filters = vcf_filter_params['FORMAT']
self.filter_filters = vcf_filter_params['FILTER']
self.flag_germline = vcf_filter_params['INFO_FLAG_GERMLINE']
run_status = False
a_type = ['normal_panel', 'mutations', 'lof_genes']

for analysis in a_type:
if status[analysis] and analysis == 'normal_panel':
logging.info(f"Tagging germline variants with INFO tag:{f.np_tag}")
self.tag_germline_vars(f.np_tag, f.np_vcf)
logging.info(f"Tagging germline variants with INFO tag:{self.flag_germline}")
self.tag_germline_vars(f.np_vcf)
run_status = True
if status[analysis] and analysis == 'mutations':
logging.info("Annotating driver mutations with INFO field:DRV=<consequence(s)>")
self.annot_drv_muts(f.muts_file)
run_status = True
if status[analysis] and analysis == 'lof_genes':
logging.info("Annotating LoF genes with INFO filed:DRV=LoF")
lof_types = f.format(['lof_type'])
self.annotate_lof_genes(f.genes_file, lof_types['lof_type'])
logging.info("Annotating LoF genes with INFO field:DRV=LoF")
self.annotate_lof_genes(f.genes_file)
run_status = True
if run_status:
logging.info("concatenating results")
Expand All @@ -79,24 +81,23 @@ def _set_input_vcf(self, input_data):
self.vcf_name = input_data['vcf_file']['name']
self.outfile_name = self.outdir + '/' + self.vcf_name + '{}'

def tag_germline_vars(self, np_tag, np_vcf):
def tag_germline_vars(self, np_vcf):
"""
use normal panel to tag germline variants and created filtered vcf file to use in
use normal panel to tag germline variants and create filtered vcf file to use in
subsequent driver annotation steps ...
sets filtered vcf as new user input parameter for downstream analysis
add tagged and filtered vcf files to concat in final step
:param np_tag:
:param np_vcf:
:return:
"""
tagged_vcf = self.outfile_name.format('_np.vcf.gz')
filtered_vcf = self.outfile_name.format('_np_filtered.vcf.gz')

cmd = f"bcftools annotate -a {np_vcf} -i '{self.vcf_filter}' -m '{np_tag}'" \
cmd = f"bcftools annotate -a {np_vcf} -i '{self.filter_filters}' -m '{self.flag_germline}'" \
f" {self.vcf_path} | bgzip -c >{tagged_vcf} && tabix -p vcf {tagged_vcf}"
_run_command(cmd)

cmd = f"bcftools view -i '({self.vcf_filter}) && {np_tag}=0' {tagged_vcf} | " \
cmd = f"bcftools view -i '{self.flag_germline}=0' {tagged_vcf} | " \
f"bgzip -c >{filtered_vcf} && tabix -p vcf {filtered_vcf}"

_run_command(cmd)
Expand All @@ -118,19 +119,20 @@ def annot_drv_muts(self, muts_file):
:return:
"""
muts_outfile = self.outfile_name.format('_muts.vcf.gz')
cmd = f"bcftools annotate -i '{self.vcf_filter}' --merge-logic DRV:unique" \
combined_filter = _combine_filters([self.filter_filters, self.format_filters])
cmd = f"bcftools annotate -i '{combined_filter}'" \
f" --merge-logic DRV:unique" \
f" -a {muts_file} -h {self.drv_header} " \
f"-c CHROM,FROM,TO,INFO/DRV {self.vcf_path} |" \
f"bcftools annotate -i 'DRV!=\".\" && DRV[*]==VC' | " \
f"bgzip -c >{muts_outfile} && tabix -f -p vcf {muts_outfile}"
_run_command(cmd)
self.merge_vcf_dict['a'] = muts_outfile

def annotate_lof_genes(self, genes_file, lof_types):
def annotate_lof_genes(self, genes_file):
"""
annotate vcf using known LoF genes
:param genes_file: lof gene names file
:param lof_types: lof consequences type string
:return:
"""
# create dummy genome locationo file to annoate LoF genes...
Expand All @@ -140,9 +142,10 @@ def annotate_lof_genes(self, genes_file, lof_types):
genes_outfile = self.outfile_name.format('_genes.vcf')
lof_outfile = self.outfile_name.format('_genes_lof.vcf')
lof_gene_list = get_drv_gene_list(genes_file)
# map lof effect types to pass variants...
cmd = f"bcftools annotate -a {genome_loc_file} -i '({self.vcf_filter}) && ({lof_types})' " \
combined_filter = _combine_filters([self.filter_filters, self.format_filters, self.info_filters])
cmd = f"bcftools annotate -a {genome_loc_file} -i '{combined_filter}' " \
f"-h {self.drv_header} -c CHROM,FROM,TO,INFO/DRV {self.vcf_path} >{genes_outfile}"

_run_command(cmd)
with open(lof_outfile, "w") as lof_fh, open(genes_outfile, 'r') as gene_f:
for line in gene_f:
Expand Down Expand Up @@ -172,6 +175,17 @@ def concat_results(self):


# generic methods ....
def _combine_filters(filter_array):
"""
:param filter_array: filtring paramters
:return: return formatted filtering parameters if present otherwise () equivalent to no filter...
"""
if any(filter_array):
return f"({') && ('.join(filter(None, filter_array))})"
else:
return "()"


def _get_gene(line, gene_field, field_loc):
# Not used ... kept for future implementation of different annotation fields....
# ANN=T|missense_variant|MODERATE|AGAP005273|AGAP005273| [ e.g. 'ANN', 3]
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
'python_requires': '>= 3.6',
'install_requires': ['tzlocal'],
'packages': ['annotate'],
'package_data': {'annotate':['config/*.conf','config/*.header']},
'package_data': {'annotate':['config/*.conf','config/*.header','config/*.json']},
'entry_points': {
'console_scripts': ['annotateVcf=annotate.commandline:main'],
}
Expand Down
42 changes: 28 additions & 14 deletions tests/test_celline_vcf_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,12 @@ class TestClass():
test_out = configdir + '/test_output/'

options_vcf_celline = {'vcf_file': test_dir + 'celline.vcf.gz',
'vcf_filter': ['PASS'],
'lof_genes': test_dir + 'lof_genes_v1.0.txt',
'mutations': test_dir + 'driver_mutations_sorted.tsv.gz',
'lof_type': ["stop_lost","start_lost","ess_splice","frameshift","nonsense"],
'vcf_filters': test_dir + 'filters.json',
'header_line': test_dir + 'info.header',
'outdir': test_dir + 'tmpout',
'keepTmp': False,
'germline_tag': 'NPGL',
'normal_panel': test_dir+'np.vcf.gz'
}

Expand All @@ -33,9 +31,11 @@ class TestClass():
}}
file_dict = {'input_status': {'mutations': True, 'lof_genes': True,
'normal_panel': True, 'vcf_file': True}}
lof_type = {
'lof_type': 'INFO/VC="stop_lost" || INFO/VC="start_lost" || INFO/VC="ess_splice" || INFO/VC="frameshift" || INFO/VC="nonsense"'}
my_filter = {'format_filter': 'FILTER="PASS"'}

info_filter="INFO/VC=\"stop_lost,start_lost,ess_splice,frameshift,nonsense\""
format_filter= "FORMAT/VAF[*] > 0.15"
filters_filter= "FILTER=\"PASS\""
info_flag_germline="NPGL"
# celline output
muts_vcf = f"{test_out}/celline_muts.vcf.gz"
lof_vcf = f"{test_out}/celline_genes_lof.vcf.gz"
Expand All @@ -46,23 +46,37 @@ class TestClass():
def test_celline_vcf_input(self):
# check input type function
f = self.my_formatter
file_metadata = self.file_metadata
assert file_metadata == f.format(['vcf_file']),'test_vcf_input test OK'
assert self.file_metadata == f.format(['vcf_file']),'test_vcf_input test OK'

def test_celline_file_input(self):
file_dict=self.file_dict
f = self.my_formatter
assert file_dict == f.format(['input_status']),'test_file_input test OK'

def test_celline_lof_format(self):
lof_type=self.lof_type
def test_celline_info_filter(self):
f = self.my_formatter
vcf_filters = f.format(['vcf_filters'])
vcf_filter_params = vcf_filters.get('vcf_filters', None)
assert self.info_filter == vcf_filter_params['INFO'],'test_INFO test OK'

def test_celline_format_filter(self):
f = self.my_formatter
assert lof_type == f.format(['lof_type']),'test_lof_format test OK'
vcf_filters = f.format(['vcf_filters'])
vcf_filter_params = vcf_filters.get('vcf_filters', None)
assert self.format_filter == vcf_filter_params['FORMAT'],'test_FORMAT test OK'

def test_celline_filter_format(self):
my_filter=self.my_filter
def test_celline_filter_filters(self):
f = self.my_formatter
assert my_filter == f.format(['format_filter']),'test_filter_format test OK'
vcf_filters = f.format(['vcf_filters'])
vcf_filter_params = vcf_filters.get('vcf_filters', None)
assert self.filters_filter == vcf_filter_params['FILTER'],'test_FILTER test OK'

def test_celline_info_flag_germline(self):
f = self.my_formatter
vcf_filters = f.format(['vcf_filters'])
vcf_filter_params = vcf_filters.get('vcf_filters', None)
assert self.info_flag_germline == vcf_filter_params['INFO_FLAG_GERMLINE'],'test_INFO_FLAG_GERMLINE test OK'

def chek_celline_outdir(slef):
self.test_dir + 'tmpout' == self.outdir_path

Expand Down
11 changes: 11 additions & 0 deletions tests/test_input/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
annotateVcf -vcf celline.vcf.gz -g lof_genes_v1.0.txt -m driver_mutations_sorted.tsv.gz -np np.vcf.gz -o test_output -t
annotateVcf -vcf input.vcf.gz -g lof_genes_v1.0.txt -m driver_mutations_sorted.tsv.gz -o test_output -t -filters filters.json
bcftools query -f '%CHROM\t%END\n' ../celline.vcf.gz >cell_linergions.txt

bcftools view /lustre/scratch117/casm/team215/sb43/user_query/cell_line_vafs_comparison/vafcorrect_broad_wgs/vafcorrect_out/output/PDv38is_wgs/snp/PDv38is_wgs_ACH-000879_snp_vaf.vcf.gz -R cell_linergions.txt | bgzip -c >../celline.vcf.gz

bcftools query -f '%CHROM\t%END\n' ../input.vcf.gz >org_pos.txt

bcftools view /lustre/scratch117/casm/team215/sb43/organoid_analysis/WGS/p2126/out_vafcorrect/output/WTSI-COLO_005_b/snp/WTSI-COLO_005_b_WTSI-COLO_005_a_DNA_snp_vaf.vcf.gz -R org_pos.txt >input.vcf


Binary file modified tests/test_input/celline.vcf.gz
Binary file not shown.
Binary file modified tests/test_input/celline.vcf.gz.tbi
Binary file not shown.
Loading

0 comments on commit 31b5a3d

Please sign in to comment.