Skip to content

Commit

Permalink
Merge pull request #12 from cortes-ciriano-lab/vcf-updates
Browse files Browse the repository at this point in the history
VCF Updates
  • Loading branch information
helrick authored Feb 15, 2023
2 parents 4b9fe1c + 315d52a commit e1aa147
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 17 deletions.
12 changes: 9 additions & 3 deletions savana/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,13 @@ def as_vcf(self, ref_fasta):
gt_tag = '0/1'
support_str = ';'.join([f'{label.upper()}_SUPPORT={label_count}' for label, label_count in self.support.items()])
stats_str = self.get_stats_str()
info = f'{support_str};SVLEN={self.sv_length};BP_NOTATION={self.breakpoint_notation};{stats_str}'
info = [f'{support_str};SVLEN={self.sv_length};BP_NOTATION={self.breakpoint_notation};{stats_str}']
if self.breakpoint_notation == "<INS>":
info[0] = 'SVTYPE=INS;' + info[0]
else:
info.append(info[0]) # duplicate info
info[0] = f'SVTYPE=BND;MATEID=ID_{self.count}_2;' + info[0]
info[1] = f'SVTYPE=BND;MATEID=ID_{self.count}_1;' + info[1]
# put together vcf line(s)
vcf_lines = [[
self.start_chr,
Expand All @@ -203,7 +209,7 @@ def as_vcf(self, ref_fasta):
alts[0],
'.',
'PASS',
info,
info[0],
'GT',
gt_tag
]]
Expand All @@ -216,7 +222,7 @@ def as_vcf(self, ref_fasta):
alts[1],
'.',
'PASS',
info,
info[1],
'GT',
gt_tag
])
Expand Down
10 changes: 6 additions & 4 deletions savana/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from datetime import datetime

__version__ = "0.2.2"
__version__ = "0.2.3"

samflag_desc_to_number = {
"BAM_CMATCH": 0, # M
Expand Down Expand Up @@ -238,19 +238,21 @@ def generate_vcf_header(ref_fasta, ref_fasta_index, tumour_file, example_breakpo
"##source=SAVANA.Beta",
f'##reference={ref_fasta}',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
'##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
'##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">',
'##INFO=<ID=NORMAL_SUPPORT,Number=1,Type=Float,Description="Number of variant supporting normal reads">',
'##INFO=<ID=TUMOUR_SUPPORT,Number=1,Type=Float,Description="Number of variant supporting tumour reads">',
'##INFO=<ID=SVLEN,Number=1,Type=Float,Description="Length of the SV">',
'##INFO=<ID=ORIGINATING_CLUSTER,Number=.,Type=String,Description="SAVANA internal originating cluster id supporting variant">',
'##INFO=<ID=END_CLUSTER,Number=.,Type=String,Description="SAVANA internal end cluster id supporting variant">',
'##INFO=<ID=BP_NOTATION,Number=.,Type=String,Description="+- notation format of variant (same for paired breakpoints)">'
'##INFO=<ID=BP_NOTATION,Number=1,Type=String,Description="+- notation format of variant (same for paired breakpoints)">'
])
breakpoint_stats_origin = example_breakpoint.originating_cluster.get_stats().keys()
breakpoint_stats_end = example_breakpoint.end_cluster.get_stats().keys()
for stat in breakpoint_stats_origin:
vcf_header_str.append(f'##INFO=<ID=ORIGIN_{stat.upper()},Number=1,Type=Float,Description="Originating cluster heuristic stat to measure {stat}">')
vcf_header_str.append(f'##INFO=<ID=ORIGIN_{stat.upper()},Number=1,Type=Float,Description="Originating cluster value for {stat}">')
for stat in breakpoint_stats_end:
vcf_header_str.append(f'##INFO=<ID=END_{stat.upper()},Number=1,Type=Float,Description="End cluster heuristic stat to measure {stat}">')
vcf_header_str.append(f'##INFO=<ID=END_{stat.upper()},Number=1,Type=Float,Description="End cluster value for {stat}">')
assembly_name = os.path.basename(ref_fasta)
with open(ref_fasta_index) as f:
reader = csv.reader(f, delimiter='\t')
Expand Down
23 changes: 15 additions & 8 deletions savana/savana.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from time import time
from math import ceil
from pathlib import Path
from multiprocessing import Pool, cpu_count

import pysam
Expand Down Expand Up @@ -171,13 +172,13 @@ def spawn_processes(args, bam_files, checkpoints, time_str, outdir):
vcf_string += bp.as_vcf(ref_fasta)
read_support_string += bp.as_read_support(count)
variant_stats_string += bp.as_variant_stats(count, variant_stats_cols)
with open(os.path.join(outdir, "sv_breakpoints.vcf"), 'w') as output:
with open(os.path.join(outdir, f'{args.sample}.sv_breakpoints.vcf'), 'w') as output:
output.write(vcf_string)
with open(os.path.join(outdir, "sv_breakpoints.bedpe"), 'w') as output:
with open(os.path.join(outdir, f'{args.sample}.sv_breakpoints.bedpe'), 'w') as output:
output.write(bedpe_string)
with open(os.path.join(outdir, "sv_breakpoints_read_support.tsv"), 'w') as output:
with open(os.path.join(outdir, f'{args.sample}.sv_breakpoints_read_support.tsv'), 'w') as output:
output.write(read_support_string)
with open(os.path.join(outdir, "variant.stats"), 'w') as output:
with open(os.path.join(outdir, 'variant.stats'), 'w') as output:
output.write(variant_stats_string)
if args.debug:
time_function("Output consensus breakpoints", checkpoints, time_str)
Expand All @@ -188,13 +189,13 @@ def spawn_processes(args, bam_files, checkpoints, time_str, outdir):
lenient_vcf_string = helper.generate_vcf_header(args.ref, args.ref_index, args.tumour, validated_breakpoints[0])
for bp in somatic_breakpoints_lenient:
lenient_vcf_string += bp.as_vcf(ref_fasta)
with open(os.path.join(outdir, "somatic.sv_breakpoints.lenient.vcf"), 'w') as output:
with open(os.path.join(outdir, f'{args.sample}.somatic.sv_breakpoints.lenient.vcf'), 'w') as output:
output.write(lenient_vcf_string)
# output strict vcf
strict_vcf_string = helper.generate_vcf_header(args.ref, args.ref_index, args.tumour, validated_breakpoints[0])
for bp in somatic_breakpoints_strict:
strict_vcf_string += bp.as_vcf(ref_fasta)
with open(os.path.join(outdir, "somatic.sv_breakpoints.strict.vcf"), 'w') as output:
with open(os.path.join(outdir, f'{args.sample}.somatic.sv_breakpoints.strict.vcf'), 'w') as output:
output.write(strict_vcf_string)

if args.debug:
Expand Down Expand Up @@ -227,6 +228,7 @@ def main():
parser.add_argument('--depth', nargs='?', type=int, default=3, help='Threshold for number of supporting reads (default=3)')
parser.add_argument('--threads', nargs='?', type=int, const=0, help='Number of threads to use (default=max)')
parser.add_argument('--outdir', nargs='?', required=True, help='Output directory (can exist but must be empty)')
parser.add_argument('--sample', nargs='?', type=str, help="Name to prepend to output files (default=tumour BAM filename without extension)")
parser.add_argument('--debug', action='store_true', help='Output extra debugging info and files')
parser.add_argument('--validation', nargs='?', type=str, required=False, help='VCF file to use as validation (optional)')
parser.add_argument('--version', action='version', version=f'SAVANA {helper.__version__}')
Expand All @@ -237,6 +239,11 @@ def main():
src_location = __file__
print(f'Source: {src_location}\n')

# set sample name to default if req.
if not args.sample:
args.sample = os.path.splitext(os.path.basename(args.tumour))[0]
print(f'Running as sample {args.sample}')

# create output dir if it doesn't exist
outdir = os.path.join(os.getcwd(), args.outdir)
if not os.path.exists(outdir):
Expand Down Expand Up @@ -278,7 +285,7 @@ def main():
calculate_cluster_stats(consensus_clusters, outdir)

# validate strict
output_vcf = os.path.join(outdir, 'somatic.sv_breakpoints.strict.vcf')
output_vcf = os.path.join(outdir, f'{args.sample}.somatic.sv_breakpoints.strict.vcf')
if args.validation:
try:
validation.validate_vcf(outdir, output_vcf, args.validation, 'strict')
Expand All @@ -287,7 +294,7 @@ def main():
print(f'You can retry by running "python savana/validation.py --outdir testing --input {output_vcf} --validation {args.validation}"')

# validate lenient
output_vcf = os.path.join(outdir, 'somatic.sv_breakpoints.lenient.vcf')
output_vcf = os.path.join(outdir, f'{args.sample}.somatic.sv_breakpoints.lenient.vcf')
if args.validation:
try:
validation.validate_vcf(outdir, output_vcf, args.validation, 'lenient')
Expand Down
3 changes: 1 addition & 2 deletions savana/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,7 @@ def main():
if not os.path.exists(args.validation):
sys.exist(f'Provided validation vcf: "{args.validation}" does not exist. Please provide full path')

validate_vcf(outdir, args.input, args.validation, 'strict')
validate_vcf(outdir, args.input, args.validation, 'lenient')
validate_vcf(outdir, args.input, args.validation, 'custom')

if __name__ == "__main__":
main()

0 comments on commit e1aa147

Please sign in to comment.