Skip to content

Commit

Permalink
Merge branch 'stats_handle_lra' into 'dev'
Browse files Browse the repository at this point in the history
stats_from_bam: handle cigar strings using =, X instead of M (specifically lra)

See merge request research/pomoxis!151
  • Loading branch information
ftostevin-ont committed Jan 31, 2022
2 parents 3edcb05 + bb1d556 commit 22f1811
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 8 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

### Unreleased
### Fixed
- `stats_from_bam`: handle cigar strings using `=` and `X` instead of `M`.
### Changed
- Include mapping quality in `stats_from_bam` output.
### Added
- Handling of LRA bams in which NM tag is number of matches rather than edit distance.
- Added an option (`-y`) to `assess_assembly` and `mini_align` to include supplementary alignments.
- Added an option (`-d`) to `mini_align` and `assess_assembly` to select minimap2 alignment preset.
- Added accumulation of errors over a number of chunks (`-a` option in `summary_from_stats` and `assess_assembly`) to get better stats.
Expand Down
33 changes: 25 additions & 8 deletions pomoxis/stats_from_bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,24 @@ def stats_from_aligned_read(read, references, lengths):
if first_cig[0] == 5: # should always be true for minimap2
start_offset = first_cig[1]
counts, _ = read.get_cigar_stats()
match = counts[0] # alignment match (can be a sequence match or mismatch)
match = counts[0] + counts[7] + counts[8] # total of M, =, and X
ins = counts[1]
delt = counts[2]
# NM is edit distance: NM = INS + DEL + SUB
sub = NM - ins - delt

lra_flag = False
if read.has_tag('NX'):
# likely from lra
# NM is number of matches, see https://github.com/ChaissonLab/LRA/issues/32
sub = counts[8]
lra_flag = True
else:
# likely from minimap2
# NM is edit distance: NM = INS + DEL + SUB
sub = NM - ins - delt

length = match + ins + delt
iden = 100 * float(match - sub) / match
acc = 100 - 100 * float(NM) / length
acc = 100 * float(match - sub) / length

read_length = read.infer_read_length()
coverage = 100 * float(read.query_alignment_length) / read_length
Expand Down Expand Up @@ -84,7 +94,7 @@ def stats_from_aligned_read(read, references, lengths):
"mapq": read.mapping_quality,
}

return results
return results, lra_flag


def masked_stats_from_aligned_read(read, references, lengths, tree):
Expand Down Expand Up @@ -186,6 +196,7 @@ def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, be
if not all_alignments and (read.is_secondary or read.is_supplementary):
continue

lra_flag = False
if bed_file is not None:
if not trees[read.reference_name].overlaps(read.reference_start, read.reference_end):
sys.stderr.write('read {} does not overlap with any regions in bedfile\n'.format(read.query_name))
Expand All @@ -197,15 +208,15 @@ def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, be
counts['all_matches_masked'] += 1
continue
else:
result = stats_from_aligned_read(read, bam.references, bam.lengths)
result, lra_flag = stats_from_aligned_read(read, bam.references, bam.lengths)

if min_length is not None and result['length'] < min_length:
counts['short'] += 1
continue

results.append(result)

return counts, results
return counts, results, lra_flag


def main(arguments=None):
Expand Down Expand Up @@ -245,17 +256,23 @@ def main(arguments=None):
all_alignments=args.all_alignments,
min_length=args.min_length, bed_file=args.bed)

detected_lra = False
counts = Counter()
for batch_counts, results in mapper(func, ranges):
for batch_counts, results, lra_flag in mapper(func, ranges):
counts.update(batch_counts)
counts['total'] += len(results)
for result in results:
out_row = (str(result[x]) for x in headers)
args.output.write('\t'.join(out_row))
args.output.write('\n')
detected_lra = detected_lra or lra_flag
if pool is not None:
pool.shutdown(wait=True)

if detected_lra:
args.summary.write('Some reads contained an NX tag, assuming these were aligned with LRA.\n')
args.summary.write('If this is not the case, results may be wrong.\n')

if counts['total'] == 0:
args.summary.write('No alignments processed. Check your bam and filtering options.\n')

Expand Down

0 comments on commit 22f1811

Please sign in to comment.