fix

fulcrumgenomics · Dec 13, 2024 · d7eec19 · d7eec19
1 parent 9a5d4c9
commit d7eec19
Show file tree

Hide file tree

Showing 5 changed files with 69 additions and 55 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -76,7 +76,7 @@ jobs:
     - name: Run pytest
       shell: bash
       run: |
-        poetry run python -m pytest --cov=fgpyo --cov-report=xml --cov-branch 
+        poetry run python -m pytest --cov=bwapy --cov-report=xml --cov-branch 
 
     - name: Run docs
       shell: bash

diff --git a/.github/workflows/readthedocs.yaml b/.github/workflows/readthedocs.yaml
@@ -17,4 +17,4 @@ jobs:
     steps:
       - uses: readthedocs/actions/preview@v1
         with:
-          project-slug: "fgpyo"
+          project-slug: "bwapy"
diff --git a/bwapy/libbwapy.pyi b/bwapy/libbwapy.pyi
@@ -18,4 +18,6 @@ class BwaIndex:
 
 class Bwa:
     def __init__(self, prefix: str) -> None: ...
-    def align(self, opt: BwaOptions, max_hits: int, query: FastxRecord) -> List[AlignedSegment]: ...
+    def align(
+        self, opt: BwaOptions, max_hits: int, queries: List[FastxRecord]
+    ) -> List[AlignedSegment]: ...
diff --git a/bwapy/libbwapy.pyx b/bwapy/libbwapy.pyx
@@ -101,14 +101,21 @@ cdef class BwaIndex:
 cdef class Bwa:
 
     cdef BwaIndex _index
+    cdef unsigned char* _pacseq
 
     def __init__(self, prefix: str):
         assert Path(prefix).exists()
         self._index = BwaIndex(prefix=prefix)
         bwase_initialize()
 
-    def align(self, opt: BwaOptions, max_hits: int, query:  FastxRecord) -> List[AlignedSegment]:
-        return self._calign(opt, max_hits, query)
+        # initialize the packed binary reference sequence
+        self._pacseq = <unsigned char*>calloc(self._index._bns.l_pac//4+1, 1)
+        err_fseek(self._index._bns.fp_pac, 0, SEEK_SET)
+        err_fread_noeof(self._pacseq, 1, self._index._bns.l_pac//4+1, self._index._bns.fp_pac)
+
+    # TODO: a list of records...
+    def align(self, opt: BwaOptions, max_hits: int, queries: List[FastxRecord]) -> List[AlignedSegment]:
+        return self._calign(opt, max_hits, queries)
 
     cdef _copy_seq(self, q: FastxRecord, bwa_seq_t* s):
         seq_len = len(q.sequence)
@@ -132,21 +139,15 @@ cdef class Bwa:
         strncpy(s.name, force_bytes(q.name), len(q.name))
         s.name[len(q.name)] = b'\0'
 
-    cdef _calign(self, opt: BwaOptions, max_hits: int, query: FastxRecord):
-        queries = [query]
+    cdef _calign(self, opt: BwaOptions, max_hits: int, queries: List[FastxRecord]):
         cdef bwa_seq_t* seqs
         cdef bwa_seq_t* s
         cdef char* s_char
         cdef kstring_t* kstr
         cdef int nm
-        cdef unsigned char *pacseq
         cdef int reference_id
 
-        # initialize the packed binary reference sequence
         kstr = <kstring_t*>calloc(sizeof(kstring_t), 1)
-        pacseq = <unsigned char*>calloc(self._index._bns.l_pac//4+1, 1)
-        err_fseek(self._index._bns.fp_pac, 0, SEEK_SET)
-        err_fread_noeof(pacseq, 1, self._index._bns.l_pac//4+1, self._index._bns.fp_pac)
 
         # copy FastqProxy into bwa_seq_t
         num_seqs = len(queries)
@@ -156,22 +157,24 @@ cdef class Bwa:
             seqs[i].tid = -1
 
         # this is `bwa aln`, and the rest is `bwa samse`
+        print("[bwa_cal_sa_reg_gap]")
         bwa_cal_sa_reg_gap(0, self._index._bwt, num_seqs, seqs, opt._delegate)
 
         # create the full alignment
         for i in range(num_seqs):
             s = &seqs[i]
             # bwa_cal_sa_reg_gap frees name, seq, rseq, and qual, so add them back in again
             self._copy_seq(queries[i], s)
+            print(f"[bwa_aln2seq_core] {i}")
             bwa_aln2seq_core(s.n_aln, s.aln, s, 1, max_hits)
 
         # calculate the genomic position given the suffix array offsite
-        for i in range(num_seqs):
-            s = &seqs[i]
+        print("[bwa_cal_pac_pos_with_bwt]")
         bwa_cal_pac_pos_with_bwt(self._index._bns, num_seqs, seqs, opt._delegate.max_diff, opt._delegate.fnr, self._index._bwt)
 
         # TODO: can `_pacseq` be passed here?
         # refine gapped alignment
+        print("[bwa_refine_gapped]")
         bwa_refine_gapped(self._index._bns, num_seqs, seqs, NULL)
 
         # create the AlignedSegment
@@ -181,6 +184,7 @@ cdef class Bwa:
             q = queries[i]
 
             # make a default, unmapped, empty record
+            print("[AlignedSegment]")
             rec = AlignedSegment(header=self._index.header)
             rec.query_name = q.name
             rec.reference_id = -1
@@ -197,48 +201,56 @@ cdef class Bwa:
             rec.query_sequence = q.sequence
             rec.query_qualities = q.quality
 
-            if s.type != BWA_TYPE_NO_MATCH:  # mapped read
-                ref_len_in_alignment = pos_end(s) - s.pos
-
-                # if on the reverse strand, reverse the query sequence and qualities
-                rec.query_sequence = q.sequence[::-1]
-                if q.quality is not None:
-                    rec.query_qualities = q.quality[::-1]
-
-                # reference id
-                nn = bns_cnt_ambi(self._index._bns, s.pos, ref_len_in_alignment, &reference_id)
-                rec.reference_id = reference_id
-
-                # make this unmapped if we map off the end of the contig
-                rec.is_unmapped  = s.pos + ref_len_in_alignment - self._index._bns.anns[rec.reference_id].offset > self._index._bns.anns[rec.reference_id].len
-
-                # strand, reference start, and mapping quality
-                if s.strand:
-                    rec.is_reverse = True
-                rec.reference_start = s.pos - self._index._bns.anns[rec.reference_id].offset + 1
-                rec.mapping_quality = s.mapQ
-
-                # cigar
-                cigar = ""
-                if s.cigar:
-                    for j in range(s.n_cigar):
-                        cigar_len = __cigar_len(s.cigar[j])
-                        cigar_op  = "MIDS"[__cigar_op(s.cigar[j])]
-                        cigar = f"{cigar}{cigar_len}{cigar_op}"
-                elif s.type != BWA_TYPE_NO_MATCH:
-                    cigar = f"{s.len}M"
-
-                # tags
-                attrs = dict()
-                if s.type != BWA_TYPE_NO_MATCH:
-                    attrs["MD"] = bwa_cal_md1(s.n_cigar, s.cigar, s.len, s.pos, s.rseq if s.strand else s.seq, self._index._bns.l_pac, pacseq, kstr, &nm)
-                rec.set_tags(list(attrs.items()))
-                # TODO: tags
-
             recs.append(rec)
+
+            if s.type == BWA_TYPE_NO_MATCH:  # umapped read
+                continue
+
+            print("[AlignedSegment] aligned")
+            ref_len_in_alignment = pos_end(s) - s.pos
+
+            # if on the reverse strand, reverse the query sequence and qualities
+            rec.query_sequence = q.sequence[::-1]
+            if q.quality is not None:
+                rec.query_qualities = q.quality[::-1]
+
+            # reference id
+            nn = bns_cnt_ambi(self._index._bns, s.pos, ref_len_in_alignment, &reference_id)
+            rec.reference_id = reference_id
+
+            # make this unmapped if we map off the end of the contig
+            rec.is_unmapped  = s.pos + ref_len_in_alignment - self._index._bns.anns[rec.reference_id].offset > self._index._bns.anns[rec.reference_id].len
+
+            # strand, reference start, and mapping quality
+            if s.strand:
+                rec.is_reverse = True
+            rec.reference_start = s.pos - self._index._bns.anns[rec.reference_id].offset + 1
+            rec.mapping_quality = s.mapQ
+
+            # cigar
+            cigar = ""
+            if s.cigar:
+                for j in range(s.n_cigar):
+                    cigar_len = __cigar_len(s.cigar[j])
+                    cigar_op  = "MIDS"[__cigar_op(s.cigar[j])]
+                    cigar = f"{cigar}{cigar_len}{cigar_op}"
+            elif s.type != BWA_TYPE_NO_MATCH:
+                cigar = f"{s.len}M"
+
+            # tags
+            attrs = dict()
+            if s.type != BWA_TYPE_NO_MATCH:
+                attrs["MD"] = bwa_cal_md1(s.n_cigar, s.cigar, s.len, s.pos, s.rseq if s.strand else s.seq, self._index._bns.l_pac, self._pacseq, kstr, &nm)
+            rec.set_tags(list(attrs.items()))
+            # TODO: tags
+
+        print("[bwa_free_read_seq]")
         bwa_free_read_seq(num_seqs, seqs)
 
         free(kstr)
-        free(pacseq)
 
-        return recs
+        print("[done]")
+        return recs
+
+    def __dealloc__(self) -> None:
+        free(self._pacseq)
diff --git a/tests/test_bwapy.py b/tests/test_bwapy.py
@@ -14,7 +14,7 @@ def test_bwapy() -> None:
     sequence = "gttacctgccgtgagtaaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcgcacagac"
     fastqs = [FastxRecord(name="test", sequence=sequence)]
 
-    recs = bwa.align(opt=opt, max_hits=1, query=fastqs[0])
+    recs = bwa.align(opt=opt, max_hits=1, queries=fastqs)
     assert len(recs) == 1
     rec = recs[0]
     assert rec.query_name == "test"