Skip to content

Commit

Permalink
WIP: write_vcf
Browse files Browse the repository at this point in the history
  • Loading branch information
szhan committed Feb 25, 2024
1 parent 2979a70 commit cc50ac5
Showing 1 changed file with 53 additions and 1 deletion.
54 changes: 53 additions & 1 deletion python/tests/beagle_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,11 +667,25 @@ def compute_alt_allele_frequencies():
pass


def write_vcf(out_file):
def write_vcf(ref_ts, imputed_alleles, imputed_allele_probs, out_file, chr_name="1"):
"""
n = number of diploid individuals
h = number of query haplotypes in the individuals (2 * n)
x = number of imputed sites
:param tskit.TreeSequence ref_ts: Tree sequence with reference haplotypes.
:param numpy.ndarray imputed_site_pos: Physical positions of imputed sites.
:param numpy.ndarray imputed_alleles: Imputed alleles of query haplotypes.
:param numpy.ndarray imputed_allele_probs: Imputed allele probabilities of queries.
:param str out_file: Path to output VCF file.
:param str chr_name: Chromosome name.
:return: None
"""
assert imputed_alleles.shape == imputed_allele_probs.shape
assert imputed_alleles.shape % 2 == 0, "Even number of haplotypes is expected."
h = imputed_alleles.shape[0] # Number of query haplotypes
n = h / 2 # Number of diploid individuals
x = imputed_alleles.shape[1] # Number of imputed sites.
_HEADER = [
"##fileformat=VCFv4.2",
"##INFO=<ID=AF,Number=A,Type=Float,"
Expand All @@ -691,6 +705,44 @@ def write_vcf(out_file):
"##FORMAT=<ID=GP,Number=G,Type=Float,"
+ 'Description="Estimated Genotype Probability">',
]
_COLUMN_NAMES = [
"#",
"CHROM",
"POS",
"ID",
"REF",
"ALT",
"QUAL",
"FILTER",
"INFO",
"FORMAT",
]
with open(out_file, "w") as f:
for line in _HEADER:
f.write(line + "\n")
for col_name in _COLUMN_NAMES:
f.write(col_name + "\t")
for i in range(x):
gt_probs, dosages = compute_individual_scores(
imputed_alleles[i],
imputed_allele_probs[i],
imputed_alleles[i + 1],
imputed_allele_probs[i + 1],
)
ar2 = compute_estimated_allelic_r_squared(gt_probs)
# QUAL
# qual_str = "."
# FILTER
# filter_str = "PASS"
# INFO
dr2 = 0
af = 0
info_str = f"AR2{ar2};DR2{dr2};AF{af}"
# FORMAT
format_str = ""
for j in range(n):
format_str += imputed_alleles[j] + "|" + imputed_alleles[j + 1] + ":"
format_str += gt_probs[j] + ":"
format_str += dosages[j]
print(info_str)
print(format_str)

0 comments on commit cc50ac5

Please sign in to comment.