diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e53bd31..8e0eca3 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -35,7 +35,6 @@ jobs: - name: Set up miniconda uses: conda-incubator/setup-miniconda@v3 with: - miniforge-variant: Mambaforge miniforge-version: latest channels: conda-forge,bioconda activate-environment: prymer diff --git a/prymer/offtarget/bwa.py b/prymer/offtarget/bwa.py index b266e2f..dbdbb19 100644 --- a/prymer/offtarget/bwa.py +++ b/prymer/offtarget/bwa.py @@ -156,8 +156,10 @@ def __str__(self) -> str: @dataclass(init=True, frozen=True) class BwaResult: - """Represents zero or more hits or alignments found by BWA for the given query. The number of - hits found may be more than the number of hits listed in the `hits` attribute. + """ + Represents zero or more hits or alignments found by BWA for the given query. + + The number of hits found may be more than the number of hits listed in the `hits` attribute. Attributes: query: the query (as given, no RC'ing) @@ -346,7 +348,8 @@ def map_all(self, queries: list[Query]) -> list[BwaResult]: return results def _to_result(self, query: Query, rec: pysam.AlignedSegment) -> BwaResult: - """Converts the query and alignment to a result. + """ + Convert the query and alignment to a result. Args: query: the original query @@ -357,17 +360,20 @@ def _to_result(self, query: Query, rec: pysam.AlignedSegment) -> BwaResult: "Query and Results are out of order" f"Query=${query.id}, Result=${rec.query_name}" ) - num_hits: Optional[int] = int(rec.get_tag("HN")) if rec.has_tag("HN") else None - if rec.is_unmapped: - if num_hits is not None and num_hits > 0: - raise ValueError(f"Read was unmapped but num_hits > 0: {rec}") - return BwaResult(query=query, hit_count=0, hits=[]) - elif num_hits > self.max_hits: - return BwaResult(query=query, hit_count=num_hits, hits=[]) - else: + num_hits: int = int(rec.get_tag("HN")) if rec.has_tag("HN") else 0 + # `to_hits()` removes artifactual hits which span the boundary between concatenated + # reference sequences. If we are reporting a list of hits, the number of hits should match + # the size of this list. Otherwise, we either have zero hits, or more than the maximum + # number of hits. In both of the latter cases, we have to rely on the count reported in the + # `HN` tag. + hits: list[BwaHit] + if 0 < num_hits < self.max_hits: hits = self.to_hits(rec=rec) - hit_count = num_hits if len(hits) == 0 else len(hits) - return BwaResult(query=query, hit_count=hit_count, hits=hits) + num_hits = len(hits) + else: + hits = [] + + return BwaResult(query=query, hit_count=num_hits, hits=hits) def to_hits(self, rec: AlignedSegment) -> list[BwaHit]: """Extracts the hits from the given alignment. Beyond the current alignment @@ -412,4 +418,9 @@ def to_hits(self, rec: AlignedSegment) -> list[BwaHit]: if not self.include_alt_hits: hits = [hit for hit in hits if not hit.refname.endswith("_alt")] + # Remove hits that extend beyond the end of a contig - these are artifacts of `bwa aln`'s + # alignment process, which concatenates all reference sequences and reports hits which span + # across contigs. + hits = [hit for hit in hits if hit.end <= self.header.get_reference_length(hit.refname)] + return hits