Skip to content

Commit

Permalink
Generalize the gvcf_combine script by ploidy
Browse files Browse the repository at this point in the history
  • Loading branch information
DonFreed committed Sep 30, 2024
1 parent 2a76d32 commit dce7b5b
Showing 1 changed file with 22 additions and 6 deletions.
28 changes: 22 additions & 6 deletions sentieon_cli/scripts/gvcf_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import vcflib
import copy
import time
import itertools
import re
import resource
import collections
import io
Expand Down Expand Up @@ -59,6 +61,7 @@ class Combiner(vcflib.Shardable, vcflib.ShardResult):

info_vals = {"AC": 0, "MLEAC": 0, "AF": 0, "MLEAF": 0, "RPA": 0}
sample_vals = {"AD": lambda s: 0 if 'DP' not in s else s['DP'] - sum(s['AD'])}
gtpat = re.compile(r'[|/]')

def __init__(self, ref, gvcf_input, vcf_input, gvcf_output):
self.ref = Reference(ref)
Expand All @@ -81,6 +84,9 @@ def extra_headers(vcf1, vcf2):
diffs = filter_diff + info_diff + fmt_diff
return [h for h in vcf2.headers if any([h.startswith(s) for s in diffs])]

def get_ploidy(self, v):
return len(self.gtpat.split(v.samples[0].get("GT", ".")))

def __del__(self):
self.vcf_in.close()
self.gvcf_in.close()
Expand Down Expand Up @@ -135,7 +141,16 @@ def grouper(self, gvcfi, vcfi):
heapq.heappush(q, (vv.pos, vv.end, 0, vv, iters[0]))
g.info = {'END': g.end}
g.qual = None
g.samples[0] = {'GT': '0/0', 'DP': v2.samples[0].get('DP'), 'GQ': 15, 'PL': [0, 15, 99]}
ploidy = self.get_ploidy(v2)
pl, plv = [], 0
for gt in itertools.combinations_with_replacement(range(2), ploidy):
pl.append(plv)
plv += 20
g.samples[0] = {
'GT': '/'.join(['0' for _ in range(ploidy)]),
'DP': v2.samples[0].get('DP'), 'GQ': 15,
'PL': pl,
}
g.line = None
heapq.heappush(q, (g.pos, g.end, 0, g, iters[0]))
else:
Expand Down Expand Up @@ -199,17 +214,18 @@ def combine(self, shard=None):
g.line = None
self.vcftmp.emit(g)
elif pos >= start:
g.samples[0]['GT'] = '0/0'
ploidy = self.get_ploidy(g)
g.samples[0]['GT'] = '/'.join(['0' for _ in range(ploidy)])
g.line = None
self.vcftmp.emit(g)
if v and pos >= start:
v.alt.append('<NON_REF>')
if "PL" in v.samples[0]:
n_alt = len(v.alt)
idx = -1
for i in range(n_alt+1):
idx += n_alt + 1 - i
v.samples[0]['PL'].insert(idx, 100)
ploidy = self.get_ploidy(v)
for idx, gt in enumerate(itertools.combinations_with_replacement(range(n_alt+1), ploidy)):
if n_alt in gt:
v.samples[0]['PL'].insert(idx, 100)
for k in self.remove_info_keys:
if k in v.info:
del v.info[k]
Expand Down

0 comments on commit dce7b5b

Please sign in to comment.