From d7eec19d2c777bc4786f47b833fef92b5b5b2856 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Fri, 13 Dec 2024 10:36:04 -0700 Subject: [PATCH] fix --- .github/workflows/ci.yml | 2 +- .github/workflows/readthedocs.yaml | 2 +- bwapy/libbwapy.pyi | 4 +- bwapy/libbwapy.pyx | 114 ++++++++++++++++------------- tests/test_bwapy.py | 2 +- 5 files changed, 69 insertions(+), 55 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3db2631..0e8fdf3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -76,7 +76,7 @@ jobs: - name: Run pytest shell: bash run: | - poetry run python -m pytest --cov=fgpyo --cov-report=xml --cov-branch + poetry run python -m pytest --cov=bwapy --cov-report=xml --cov-branch - name: Run docs shell: bash diff --git a/.github/workflows/readthedocs.yaml b/.github/workflows/readthedocs.yaml index a960af4..7625296 100644 --- a/.github/workflows/readthedocs.yaml +++ b/.github/workflows/readthedocs.yaml @@ -17,4 +17,4 @@ jobs: steps: - uses: readthedocs/actions/preview@v1 with: - project-slug: "fgpyo" + project-slug: "bwapy" diff --git a/bwapy/libbwapy.pyi b/bwapy/libbwapy.pyi index 95bd8ee..7daaf1c 100644 --- a/bwapy/libbwapy.pyi +++ b/bwapy/libbwapy.pyi @@ -18,4 +18,6 @@ class BwaIndex: class Bwa: def __init__(self, prefix: str) -> None: ... - def align(self, opt: BwaOptions, max_hits: int, query: FastxRecord) -> List[AlignedSegment]: ... + def align( + self, opt: BwaOptions, max_hits: int, queries: List[FastxRecord] + ) -> List[AlignedSegment]: ... diff --git a/bwapy/libbwapy.pyx b/bwapy/libbwapy.pyx index 6cf9ecd..34cb548 100644 --- a/bwapy/libbwapy.pyx +++ b/bwapy/libbwapy.pyx @@ -101,14 +101,21 @@ cdef class BwaIndex: cdef class Bwa: cdef BwaIndex _index + cdef unsigned char* _pacseq def __init__(self, prefix: str): assert Path(prefix).exists() self._index = BwaIndex(prefix=prefix) bwase_initialize() - def align(self, opt: BwaOptions, max_hits: int, query: FastxRecord) -> List[AlignedSegment]: - return self._calign(opt, max_hits, query) + # initialize the packed binary reference sequence + self._pacseq = calloc(self._index._bns.l_pac//4+1, 1) + err_fseek(self._index._bns.fp_pac, 0, SEEK_SET) + err_fread_noeof(self._pacseq, 1, self._index._bns.l_pac//4+1, self._index._bns.fp_pac) + + # TODO: a list of records... + def align(self, opt: BwaOptions, max_hits: int, queries: List[FastxRecord]) -> List[AlignedSegment]: + return self._calign(opt, max_hits, queries) cdef _copy_seq(self, q: FastxRecord, bwa_seq_t* s): seq_len = len(q.sequence) @@ -132,21 +139,15 @@ cdef class Bwa: strncpy(s.name, force_bytes(q.name), len(q.name)) s.name[len(q.name)] = b'\0' - cdef _calign(self, opt: BwaOptions, max_hits: int, query: FastxRecord): - queries = [query] + cdef _calign(self, opt: BwaOptions, max_hits: int, queries: List[FastxRecord]): cdef bwa_seq_t* seqs cdef bwa_seq_t* s cdef char* s_char cdef kstring_t* kstr cdef int nm - cdef unsigned char *pacseq cdef int reference_id - # initialize the packed binary reference sequence kstr = calloc(sizeof(kstring_t), 1) - pacseq = calloc(self._index._bns.l_pac//4+1, 1) - err_fseek(self._index._bns.fp_pac, 0, SEEK_SET) - err_fread_noeof(pacseq, 1, self._index._bns.l_pac//4+1, self._index._bns.fp_pac) # copy FastqProxy into bwa_seq_t num_seqs = len(queries) @@ -156,6 +157,7 @@ cdef class Bwa: seqs[i].tid = -1 # this is `bwa aln`, and the rest is `bwa samse` + print("[bwa_cal_sa_reg_gap]") bwa_cal_sa_reg_gap(0, self._index._bwt, num_seqs, seqs, opt._delegate) # create the full alignment @@ -163,15 +165,16 @@ cdef class Bwa: s = &seqs[i] # bwa_cal_sa_reg_gap frees name, seq, rseq, and qual, so add them back in again self._copy_seq(queries[i], s) + print(f"[bwa_aln2seq_core] {i}") bwa_aln2seq_core(s.n_aln, s.aln, s, 1, max_hits) # calculate the genomic position given the suffix array offsite - for i in range(num_seqs): - s = &seqs[i] + print("[bwa_cal_pac_pos_with_bwt]") bwa_cal_pac_pos_with_bwt(self._index._bns, num_seqs, seqs, opt._delegate.max_diff, opt._delegate.fnr, self._index._bwt) # TODO: can `_pacseq` be passed here? # refine gapped alignment + print("[bwa_refine_gapped]") bwa_refine_gapped(self._index._bns, num_seqs, seqs, NULL) # create the AlignedSegment @@ -181,6 +184,7 @@ cdef class Bwa: q = queries[i] # make a default, unmapped, empty record + print("[AlignedSegment]") rec = AlignedSegment(header=self._index.header) rec.query_name = q.name rec.reference_id = -1 @@ -197,48 +201,56 @@ cdef class Bwa: rec.query_sequence = q.sequence rec.query_qualities = q.quality - if s.type != BWA_TYPE_NO_MATCH: # mapped read - ref_len_in_alignment = pos_end(s) - s.pos - - # if on the reverse strand, reverse the query sequence and qualities - rec.query_sequence = q.sequence[::-1] - if q.quality is not None: - rec.query_qualities = q.quality[::-1] - - # reference id - nn = bns_cnt_ambi(self._index._bns, s.pos, ref_len_in_alignment, &reference_id) - rec.reference_id = reference_id - - # make this unmapped if we map off the end of the contig - rec.is_unmapped = s.pos + ref_len_in_alignment - self._index._bns.anns[rec.reference_id].offset > self._index._bns.anns[rec.reference_id].len - - # strand, reference start, and mapping quality - if s.strand: - rec.is_reverse = True - rec.reference_start = s.pos - self._index._bns.anns[rec.reference_id].offset + 1 - rec.mapping_quality = s.mapQ - - # cigar - cigar = "" - if s.cigar: - for j in range(s.n_cigar): - cigar_len = __cigar_len(s.cigar[j]) - cigar_op = "MIDS"[__cigar_op(s.cigar[j])] - cigar = f"{cigar}{cigar_len}{cigar_op}" - elif s.type != BWA_TYPE_NO_MATCH: - cigar = f"{s.len}M" - - # tags - attrs = dict() - if s.type != BWA_TYPE_NO_MATCH: - attrs["MD"] = bwa_cal_md1(s.n_cigar, s.cigar, s.len, s.pos, s.rseq if s.strand else s.seq, self._index._bns.l_pac, pacseq, kstr, &nm) - rec.set_tags(list(attrs.items())) - # TODO: tags - recs.append(rec) + + if s.type == BWA_TYPE_NO_MATCH: # umapped read + continue + + print("[AlignedSegment] aligned") + ref_len_in_alignment = pos_end(s) - s.pos + + # if on the reverse strand, reverse the query sequence and qualities + rec.query_sequence = q.sequence[::-1] + if q.quality is not None: + rec.query_qualities = q.quality[::-1] + + # reference id + nn = bns_cnt_ambi(self._index._bns, s.pos, ref_len_in_alignment, &reference_id) + rec.reference_id = reference_id + + # make this unmapped if we map off the end of the contig + rec.is_unmapped = s.pos + ref_len_in_alignment - self._index._bns.anns[rec.reference_id].offset > self._index._bns.anns[rec.reference_id].len + + # strand, reference start, and mapping quality + if s.strand: + rec.is_reverse = True + rec.reference_start = s.pos - self._index._bns.anns[rec.reference_id].offset + 1 + rec.mapping_quality = s.mapQ + + # cigar + cigar = "" + if s.cigar: + for j in range(s.n_cigar): + cigar_len = __cigar_len(s.cigar[j]) + cigar_op = "MIDS"[__cigar_op(s.cigar[j])] + cigar = f"{cigar}{cigar_len}{cigar_op}" + elif s.type != BWA_TYPE_NO_MATCH: + cigar = f"{s.len}M" + + # tags + attrs = dict() + if s.type != BWA_TYPE_NO_MATCH: + attrs["MD"] = bwa_cal_md1(s.n_cigar, s.cigar, s.len, s.pos, s.rseq if s.strand else s.seq, self._index._bns.l_pac, self._pacseq, kstr, &nm) + rec.set_tags(list(attrs.items())) + # TODO: tags + + print("[bwa_free_read_seq]") bwa_free_read_seq(num_seqs, seqs) free(kstr) - free(pacseq) - return recs \ No newline at end of file + print("[done]") + return recs + + def __dealloc__(self) -> None: + free(self._pacseq) \ No newline at end of file diff --git a/tests/test_bwapy.py b/tests/test_bwapy.py index f651ee0..6e23367 100644 --- a/tests/test_bwapy.py +++ b/tests/test_bwapy.py @@ -14,7 +14,7 @@ def test_bwapy() -> None: sequence = "gttacctgccgtgagtaaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcgcacagac" fastqs = [FastxRecord(name="test", sequence=sequence)] - recs = bwa.align(opt=opt, max_hits=1, query=fastqs[0]) + recs = bwa.align(opt=opt, max_hits=1, queries=fastqs) assert len(recs) == 1 rec = recs[0] assert rec.query_name == "test"