Skip to content

Commit

Permalink
Merge pull request #45 from PacificBiosciences/bugfix/phase_filter
Browse files Browse the repository at this point in the history
updates for v1.4.4
  • Loading branch information
holtjma authored Aug 15, 2024
2 parents 52ab5c0 + b97d6db commit a80258e
Show file tree
Hide file tree
Showing 11 changed files with 252 additions and 8 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# 1.4.3
# v1.4.4
## Fixed
* Fixed an error where phasing information that was present in input files would be copied through to output files if it was not overwritten by HiPhase phasing results. HiPhase will now automatically remove this phasing information to prevent accidental mixing of phase results.
* For VCF files, any unphased genotypes will be switched to unphased and sorted by allele index (e.g. 1|0 -> 0/1). The "FORMAT:PS" and "FORMAT:PF" tags will either be removed entirely if the whole record is unphased or set to "." for partially phased records.
* For BAM files, the "HP" and "PS" tags will be removed for any unphased records.

# v1.4.3
## Fixed
* Replaced a panic caused by a chromosome appearing in a VCF but not in the BAM file with a more descriptive error message
* Fixed an error caused by a multi-sample VCF with a mixture of haploid and diploid genotypes
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "hiphase"
version = "1.4.3"
version = "1.4.4"
authors = ["J. Matthew Holt <[email protected]>"]
description = "A tool for jointly phasing small, structural, and tandem repeat variants for PacBio sequencing data"
edition = "2021"
Expand Down
2 changes: 1 addition & 1 deletion LICENSE-THIRDPARTY.json
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@
},
{
"name": "hiphase",
"version": "1.4.3",
"version": "1.4.4",
"authors": "J. Matthew Holt <[email protected]>",
"repository": null,
"license": null,
Expand Down
6 changes: 6 additions & 0 deletions docs/user_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -337,3 +337,9 @@ Since TRGT calls often correspond to multiple small variant calls, the total num
Additionally, this has the benefit of removing false heterozygous variants in the tandem repeat regions.
This can have the side effect of (correctly) reducing block NG50 when those false variants are the only links between two otherwise unlinked phase blocks.
Variants that are intentionally excluded from phasing this way will have a FORMAT `PF` tag of `TR_OVERLAP`.

## Can I provide pre-existing phase information to HiPhase?
Currently no, HiPhase will not use any pre-existing phase information in the provided VCF or BAM files.
To prevent accidental mis-interpretation of results, HiPhase will remove or overwrite any existing phase information from the input files prior to writing the output phased VCFs and/or BAMs.
For VCF files, this includes the "FORMAT:GT" (genotype), "FORMAT:PS" (phase set), and "FORMAT:PF" (phase filter) fields.
For BAM files, this includes the "HP" (haplotype) and "PS" (phase set) tags.
67 changes: 64 additions & 3 deletions src/writers/ordered_bam_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,8 @@ impl OrderedBamWriter {
bam_writer.write(&record)?;
},
None => {
// no match, so just copy the read over
// no match, so just copy the read over after stripping any phase information
strip_record_phasing(&mut record)?;
bam_writer.write(&record)?;
}
};
Expand Down Expand Up @@ -277,7 +278,7 @@ impl OrderedBamWriter {
Ok(()) => {
for record_result in bam_reader.records() {
// set up the record for the new BAM file
let record = record_result?;
let mut record = record_result?;
let record_pos = record.pos();
if record_pos < start_pos as i64 {
// this can happen when you have reads that overlap the location but don't start *after* the position
Expand All @@ -286,6 +287,7 @@ impl OrderedBamWriter {
}

// nothing left should be tagged
strip_record_phasing(&mut record)?;
bam_writer.write(&record)?;

// adding this last bit should prevent double writes by accident from a user
Expand Down Expand Up @@ -336,9 +338,10 @@ impl OrderedBamWriter {
Ok(()) => {
for record_result in bam_reader.records() {
// set up the record for the new BAM file
let record = record_result?;
let mut record = record_result?;

// nothing left should be tagged
strip_record_phasing(&mut record)?;
bam_writer.write(&record)?;
}
},
Expand All @@ -353,3 +356,61 @@ impl OrderedBamWriter {
Ok(())
}
}

/// Removes the phase tags that HiPhase will normally write from a record.
/// This is so we erase anything that did not come from the tool.
fn strip_record_phasing(record: &mut bam::Record) -> Result<(), rust_htslib::errors::Error> {
// much simpler than the VCF version since there is no multi-sample to worry about, just drop these tags
match record.remove_aux(b"HP") {
// either it was remove or was not there to begin with
Ok(()) |
Err(rust_htslib::errors::Error::BamAuxTagNotFound) => {},
// some other crazy error
Err(e) => {
return Err(e);
}
};
match record.remove_aux(b"PS") {
// either it was remove or was not there to begin with
Ok(()) |
Err(rust_htslib::errors::Error::BamAuxTagNotFound) => Ok(()),
// some other crazy error
Err(e) => Err(e)
}
}

#[cfg(test)]
mod tests {
use super::*;

/// Simple empty bam record creator; if we just use a "new()" record, htslib is unhappy when we access Aux
fn dummy_bam_record() -> bam::Record {
let mut record = bam::Record::new();
record.set(
b"test_qname",
None,
b"A",
&[255]
);
record
}

#[test]
fn test_clear_record() {
// make sure running on a record without the tags is fine (this will be *most* cases)
let mut record = dummy_bam_record();
strip_record_phasing(&mut record).unwrap();

// create a record with the tags
let mut record = dummy_bam_record();
record.push_aux(b"HP", bam::record::Aux::U8(1)).unwrap();
record.push_aux(b"PS", bam::record::Aux::I32(42)).unwrap();
assert!(record.aux(b"HP").is_ok());
assert!(record.aux(b"PS").is_ok());

// strip them and verify they are gone
strip_record_phasing(&mut record).unwrap();
assert_eq!(record.aux(b"HP").err().unwrap(), rust_htslib::errors::Error::BamAuxTagNotFound);
assert_eq!(record.aux(b"PS").err().unwrap(), rust_htslib::errors::Error::BamAuxTagNotFound);
}
}
148 changes: 147 additions & 1 deletion src/writers/ordered_vcf_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ impl OrderedVcfWriter {

//now setup the outputs, we want to do the header stuff here also
let mut output_header: bcf::header::Header = bcf::header::Header::from_template(&vcf_header);

// we can remove any existing headers without crashing anything
output_header.remove_format(b"PS");
output_header.remove_format(b"PF");

// add in the new headers
let cli_string: String = std::env::args().collect::<Vec<String>>().join(" ");
let cli_version: &str = &crate::cli::FULL_VERSION;
output_header.push_record(format!(r#"##HiPhase_version="{cli_version}""#).as_bytes());
Expand Down Expand Up @@ -308,12 +314,16 @@ impl OrderedVcfWriter {
// we have already written though, so don't write it again
continue;
}

// the record will get written, clear out all phasing information first
strip_record_phasing(&mut record)?;

// we now have to iterate over each sample and modify the entries as necessary
let vcf_sample_indices = &self.sample_indices[vcf_index];
let mut changes_made: bool = false;

// initialize the alleles array to be the same, these may change inside the loop
// initialize the alleles array to be the same as what's currently in the record; this is post-phase stripping
// alleles may get overwritten following this loop depending on the success of the phasing
let mut alleles = vec![];
let record_gt = record.genotypes().unwrap();
for si in 0..record.sample_count() {
Expand Down Expand Up @@ -422,3 +432,139 @@ impl OrderedVcfWriter {
Ok(())
}
}

/// This will remove all phasing information from a given VCF record.
/// # Arguments
/// * `record` - the VCF to strip all phasing from
/// # Errors
/// * if it fails to clear the PS tag
/// * if it fails to parse genotypes
/// * if it encounters any weird genotypes (we probably would fail before this)
fn strip_record_phasing(record: &mut bcf::Record) -> Result<(), Box<dyn std::error::Error>> {
// first remove the "PS" tag
clear_format_tag(record, b"PS")?;
clear_format_tag(record, b"PF")?;

let record_gt = record.genotypes()?;
let mut alleles = vec![];
for si in 0..record.sample_count() {
let genotype = record_gt.get(si as usize);
match genotype.len() {
0 => bail!("Encountered empty genotype record at position {}", record.pos()),
1 => {
// TRGT can make single-allele GT calls, just copy it over as normal
// it will not be modified below because it is a homozygous allele
let g0 = unphase_genotype_allele(genotype[0]);
alleles.push(i32::from(g0));
alleles.push(i32::MIN+1); // sentinel value for end
},
2 => {
// this is 99.99999999% path
let g0 = i32::from(unphase_genotype_allele(genotype[0]));
let g1 = i32::from(unphase_genotype_allele(genotype[1]));
let min_value = g0.min(g1);
let max_value = g0.max(g1);
alleles.push(min_value);
alleles.push(max_value);
},
gt_len => {
// we do not have 3+ GT entries implemented
bail!("Encountered GT of length {} at record {}", gt_len, record.desc())
}
}
}
record.push_format_integer(b"GT", &alleles)?;

Ok(())
}

/// This will remove a specified format field from the provided record.
/// Other approaches seem to set it to "." instead of removing the field entirely.
/// If the tag is absent (from this record or the entire file), this seemingly returns without error.
/// # Arguments
/// * `record` - the record to remove the tag from
/// * `tag` - the tag to remove, e.g. b"PS"
/// # Errors
/// * if we cannot remove the tag for some reason
fn clear_format_tag(record: &mut bcf::Record, tag: &[u8]) -> Result<(), rust_htslib::errors::Error> {
let tag_c_str = std::ffi::CString::new(tag).unwrap();
unsafe {
if rust_htslib::htslib::bcf_update_format(
record.header().inner,
record.inner,
tag_c_str.into_raw(),
std::ptr::null() as *const ::std::os::raw::c_void,
0,
rust_htslib::htslib::BCF_HT_INT as i32
) == 0 {
Ok(())
} else {
Err(rust_htslib::errors::Error::BcfSetTag { tag: std::str::from_utf8(tag).unwrap().to_owned() })
}
}
}

/// This will convert a genotype allele into an unphased version.
/// # Arguments
/// * `gt` - the genotype allele to unphase
fn unphase_genotype_allele(gt: GenotypeAllele) -> GenotypeAllele {
match gt {
GenotypeAllele::Unphased(i) |
GenotypeAllele::Phased(i) => GenotypeAllele::Unphased(i),
GenotypeAllele::UnphasedMissing |
GenotypeAllele::PhasedMissing => GenotypeAllele::UnphasedMissing
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_clear_record() {
let vcf_fn = PathBuf::from("./test_data/prephased_test/prephased.vcf");
let mut vcf_reader = bcf::Reader::from_path(&vcf_fn).unwrap();

// expectations for sample 1
let expected_gt1 = vec![
vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)],
vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)],
vec![GenotypeAllele::Unphased(1)],
vec![GenotypeAllele::UnphasedMissing, GenotypeAllele::Unphased(1)]
];

// expectations for sample 2
let expected_gt2 = vec![
vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)],
vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(0)],
vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(2)],
vec![GenotypeAllele::UnphasedMissing, GenotypeAllele::Unphased(0)]
];

for (record_result, (exp_gt1, exp_gt2)) in vcf_reader.records()
.zip(expected_gt1.iter().zip(expected_gt2.iter())) {
// load the record and strip out the phasing information
let mut record = record_result.unwrap();
println!("Handling {}", record.desc());
strip_record_phasing(&mut record).unwrap();

// verify that PS tag is completely gone
let ps_tag_result = record.format(b"PS").integer();
assert!(ps_tag_result.is_err());
assert_eq!(ps_tag_result.err().unwrap(), rust_htslib::errors::Error::BcfMissingTag { tag: "PS".to_string(), record: record.desc() });

// verify that PF tag is completely gone
let pf_tag_result = record.format(b"PF").string();
assert!(pf_tag_result.is_err());
assert_eq!(pf_tag_result.err().unwrap(), rust_htslib::errors::Error::BcfMissingTag { tag: "PF".to_string(), record: record.desc() });

// make sure GT 1 matches our expectation, which will be unphased
let genotype1 = record.genotypes().unwrap().get(0);
assert_eq!(genotype1.as_slice(), exp_gt1.as_slice());

// make sure GT 2 matches our expectation, which will be unphased
let genotype2 = record.genotypes().unwrap().get(1);
assert_eq!(genotype2.as_slice(), exp_gt2.as_slice());
}
}
}
Binary file added test_data/iupac_test/small_variants.vcf.gz
Binary file not shown.
Binary file added test_data/iupac_test/small_variants.vcf.gz.tbi
Binary file not shown.
2 changes: 2 additions & 0 deletions test_data/iupac_test/test_reference.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>chr1
ACGTARGTACGT
23 changes: 23 additions & 0 deletions test_data/prephased_test/prephased.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=RefCall,Description="Genotyping model thinks this site is reference.">
##FILTER=<ID=LowQual,Description="Confidence in this variant being real is below calling threshold.">
##FILTER=<ID=NoCall,Description="Site has depth=0 resulting in no call.">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position (for use with symbolic alleles)">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block.">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=VAF,Number=A,Type=Float,Description="Variant allele fractions.">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods rounded to the closest integer">
##FORMAT=<ID=MED_DP,Number=1,Type=Integer,Description="Median DP observed within the GVCF block rounded to the nearest integer.">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set identifier">
##FORMAT=<ID=PF,Number=1,Type=String,Description="Phasing flag">
##DeepVariant_version=1.3.0
##contig=<ID=chr1,length=12>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_name sample_name2
chr1 5 . A T 30 PASS . GT:PS:PF 0|1:5:test 1|0:5:.
chr1 15 . A T 30 PASS . GT:PS 1|1:5 0|0:5
chr1 25 . A T,C 30 PASS . GT:PS 1:5 2|1:5
chr1 30 . A T 30 PASS . GT:PS .|1:5 0|.:5

0 comments on commit a80258e

Please sign in to comment.