Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
nh13 committed Dec 13, 2024
1 parent 9a5d4c9 commit d7eec19
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
- name: Run pytest
shell: bash
run: |
poetry run python -m pytest --cov=fgpyo --cov-report=xml --cov-branch
poetry run python -m pytest --cov=bwapy --cov-report=xml --cov-branch
- name: Run docs
shell: bash
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ jobs:
steps:
- uses: readthedocs/actions/preview@v1
with:
project-slug: "fgpyo"
project-slug: "bwapy"
4 changes: 3 additions & 1 deletion bwapy/libbwapy.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@ class BwaIndex:

class Bwa:
def __init__(self, prefix: str) -> None: ...
def align(self, opt: BwaOptions, max_hits: int, query: FastxRecord) -> List[AlignedSegment]: ...
def align(
self, opt: BwaOptions, max_hits: int, queries: List[FastxRecord]
) -> List[AlignedSegment]: ...
114 changes: 63 additions & 51 deletions bwapy/libbwapy.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,21 @@ cdef class BwaIndex:
cdef class Bwa:

cdef BwaIndex _index
cdef unsigned char* _pacseq

def __init__(self, prefix: str):
assert Path(prefix).exists()
self._index = BwaIndex(prefix=prefix)
bwase_initialize()

def align(self, opt: BwaOptions, max_hits: int, query: FastxRecord) -> List[AlignedSegment]:
return self._calign(opt, max_hits, query)
# initialize the packed binary reference sequence
self._pacseq = <unsigned char*>calloc(self._index._bns.l_pac//4+1, 1)
err_fseek(self._index._bns.fp_pac, 0, SEEK_SET)
err_fread_noeof(self._pacseq, 1, self._index._bns.l_pac//4+1, self._index._bns.fp_pac)

# TODO: a list of records...
def align(self, opt: BwaOptions, max_hits: int, queries: List[FastxRecord]) -> List[AlignedSegment]:
return self._calign(opt, max_hits, queries)

cdef _copy_seq(self, q: FastxRecord, bwa_seq_t* s):
seq_len = len(q.sequence)
Expand All @@ -132,21 +139,15 @@ cdef class Bwa:
strncpy(s.name, force_bytes(q.name), len(q.name))
s.name[len(q.name)] = b'\0'

cdef _calign(self, opt: BwaOptions, max_hits: int, query: FastxRecord):
queries = [query]
cdef _calign(self, opt: BwaOptions, max_hits: int, queries: List[FastxRecord]):
cdef bwa_seq_t* seqs
cdef bwa_seq_t* s
cdef char* s_char
cdef kstring_t* kstr
cdef int nm
cdef unsigned char *pacseq
cdef int reference_id

# initialize the packed binary reference sequence
kstr = <kstring_t*>calloc(sizeof(kstring_t), 1)
pacseq = <unsigned char*>calloc(self._index._bns.l_pac//4+1, 1)
err_fseek(self._index._bns.fp_pac, 0, SEEK_SET)
err_fread_noeof(pacseq, 1, self._index._bns.l_pac//4+1, self._index._bns.fp_pac)

# copy FastqProxy into bwa_seq_t
num_seqs = len(queries)
Expand All @@ -156,22 +157,24 @@ cdef class Bwa:
seqs[i].tid = -1

# this is `bwa aln`, and the rest is `bwa samse`
print("[bwa_cal_sa_reg_gap]")
bwa_cal_sa_reg_gap(0, self._index._bwt, num_seqs, seqs, opt._delegate)

# create the full alignment
for i in range(num_seqs):
s = &seqs[i]
# bwa_cal_sa_reg_gap frees name, seq, rseq, and qual, so add them back in again
self._copy_seq(queries[i], s)
print(f"[bwa_aln2seq_core] {i}")
bwa_aln2seq_core(s.n_aln, s.aln, s, 1, max_hits)

# calculate the genomic position given the suffix array offsite
for i in range(num_seqs):
s = &seqs[i]
print("[bwa_cal_pac_pos_with_bwt]")
bwa_cal_pac_pos_with_bwt(self._index._bns, num_seqs, seqs, opt._delegate.max_diff, opt._delegate.fnr, self._index._bwt)

# TODO: can `_pacseq` be passed here?
# refine gapped alignment
print("[bwa_refine_gapped]")
bwa_refine_gapped(self._index._bns, num_seqs, seqs, NULL)

# create the AlignedSegment
Expand All @@ -181,6 +184,7 @@ cdef class Bwa:
q = queries[i]

# make a default, unmapped, empty record
print("[AlignedSegment]")
rec = AlignedSegment(header=self._index.header)
rec.query_name = q.name
rec.reference_id = -1
Expand All @@ -197,48 +201,56 @@ cdef class Bwa:
rec.query_sequence = q.sequence
rec.query_qualities = q.quality

if s.type != BWA_TYPE_NO_MATCH: # mapped read
ref_len_in_alignment = pos_end(s) - s.pos

# if on the reverse strand, reverse the query sequence and qualities
rec.query_sequence = q.sequence[::-1]
if q.quality is not None:
rec.query_qualities = q.quality[::-1]

# reference id
nn = bns_cnt_ambi(self._index._bns, s.pos, ref_len_in_alignment, &reference_id)
rec.reference_id = reference_id

# make this unmapped if we map off the end of the contig
rec.is_unmapped = s.pos + ref_len_in_alignment - self._index._bns.anns[rec.reference_id].offset > self._index._bns.anns[rec.reference_id].len

# strand, reference start, and mapping quality
if s.strand:
rec.is_reverse = True
rec.reference_start = s.pos - self._index._bns.anns[rec.reference_id].offset + 1
rec.mapping_quality = s.mapQ

# cigar
cigar = ""
if s.cigar:
for j in range(s.n_cigar):
cigar_len = __cigar_len(s.cigar[j])
cigar_op = "MIDS"[__cigar_op(s.cigar[j])]
cigar = f"{cigar}{cigar_len}{cigar_op}"
elif s.type != BWA_TYPE_NO_MATCH:
cigar = f"{s.len}M"

# tags
attrs = dict()
if s.type != BWA_TYPE_NO_MATCH:
attrs["MD"] = bwa_cal_md1(s.n_cigar, s.cigar, s.len, s.pos, s.rseq if s.strand else s.seq, self._index._bns.l_pac, pacseq, kstr, &nm)
rec.set_tags(list(attrs.items()))
# TODO: tags

recs.append(rec)

if s.type == BWA_TYPE_NO_MATCH: # umapped read
continue

print("[AlignedSegment] aligned")
ref_len_in_alignment = pos_end(s) - s.pos

# if on the reverse strand, reverse the query sequence and qualities
rec.query_sequence = q.sequence[::-1]
if q.quality is not None:
rec.query_qualities = q.quality[::-1]

# reference id
nn = bns_cnt_ambi(self._index._bns, s.pos, ref_len_in_alignment, &reference_id)
rec.reference_id = reference_id

# make this unmapped if we map off the end of the contig
rec.is_unmapped = s.pos + ref_len_in_alignment - self._index._bns.anns[rec.reference_id].offset > self._index._bns.anns[rec.reference_id].len

# strand, reference start, and mapping quality
if s.strand:
rec.is_reverse = True
rec.reference_start = s.pos - self._index._bns.anns[rec.reference_id].offset + 1
rec.mapping_quality = s.mapQ

# cigar
cigar = ""
if s.cigar:
for j in range(s.n_cigar):
cigar_len = __cigar_len(s.cigar[j])
cigar_op = "MIDS"[__cigar_op(s.cigar[j])]
cigar = f"{cigar}{cigar_len}{cigar_op}"
elif s.type != BWA_TYPE_NO_MATCH:
cigar = f"{s.len}M"

# tags
attrs = dict()
if s.type != BWA_TYPE_NO_MATCH:
attrs["MD"] = bwa_cal_md1(s.n_cigar, s.cigar, s.len, s.pos, s.rseq if s.strand else s.seq, self._index._bns.l_pac, self._pacseq, kstr, &nm)
rec.set_tags(list(attrs.items()))
# TODO: tags

print("[bwa_free_read_seq]")
bwa_free_read_seq(num_seqs, seqs)

free(kstr)
free(pacseq)

return recs
print("[done]")
return recs

def __dealloc__(self) -> None:
free(self._pacseq)
2 changes: 1 addition & 1 deletion tests/test_bwapy.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_bwapy() -> None:
sequence = "gttacctgccgtgagtaaattaaaattttattgacttaggtcactaaatactttaaccaatataggcatagcgcacagac"
fastqs = [FastxRecord(name="test", sequence=sequence)]

recs = bwa.align(opt=opt, max_hits=1, query=fastqs[0])
recs = bwa.align(opt=opt, max_hits=1, queries=fastqs)
assert len(recs) == 1
rec = recs[0]
assert rec.query_name == "test"
Expand Down

0 comments on commit d7eec19

Please sign in to comment.