Merge branch 'stats_handle_lra' into 'dev'

stats_from_bam: handle cigar strings using =, X instead of M (specifically lra) See merge request research/pomoxis!151
nanoporetech · Jan 31, 2022 · 22f1811 · 22f1811
2 parents 3edcb05 + bb1d556
commit 22f1811
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,9 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ### Unreleased
+### Fixed
+- `stats_from_bam`: handle cigar strings using `=` and `X` instead of `M`.
 ### Changed
 - Include mapping quality in `stats_from_bam` output.
 ### Added
+- Handling of LRA bams in which NM tag is number of matches rather than edit distance. 
 - Added an option (`-y`) to `assess_assembly` and `mini_align` to include supplementary alignments. 
 - Added an option (`-d`) to `mini_align` and `assess_assembly` to select minimap2 alignment preset.
 - Added accumulation of errors over a number of chunks (`-a` option in `summary_from_stats` and `assess_assembly`) to get better stats.

diff --git a/pomoxis/stats_from_bam.py b/pomoxis/stats_from_bam.py
@@ -49,14 +49,24 @@ def stats_from_aligned_read(read, references, lengths):
         if first_cig[0] == 5:  # should always be true for minimap2
             start_offset = first_cig[1]
     counts, _ = read.get_cigar_stats()
-    match = counts[0] # alignment match (can be a sequence match or mismatch)
+    match = counts[0] + counts[7] + counts[8]  # total of M, =, and X
     ins = counts[1]
     delt = counts[2]
-    # NM is edit distance: NM = INS + DEL + SUB
-    sub = NM - ins - delt
+
+    lra_flag = False
+    if read.has_tag('NX'):
+        # likely from lra
+        # NM is number of matches, see https://github.com/ChaissonLab/LRA/issues/32
+        sub = counts[8]
+        lra_flag = True
+    else:
+        # likely from minimap2
+        # NM is edit distance: NM = INS + DEL + SUB
+        sub = NM - ins - delt
+
     length = match + ins + delt
     iden = 100 * float(match - sub) / match
-    acc = 100 - 100 * float(NM) / length
+    acc = 100 * float(match - sub) / length
 
     read_length = read.infer_read_length()
     coverage = 100 * float(read.query_alignment_length) / read_length
@@ -84,7 +94,7 @@ def stats_from_aligned_read(read, references, lengths):
         "mapq": read.mapping_quality,
     }
 
-    return results
+    return results, lra_flag
 
 
 def masked_stats_from_aligned_read(read, references, lengths, tree):
@@ -186,6 +196,7 @@ def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, be
             if not all_alignments and (read.is_secondary or read.is_supplementary):
                 continue
 
+            lra_flag = False
             if bed_file is not None:
                 if not trees[read.reference_name].overlaps(read.reference_start, read.reference_end):
                     sys.stderr.write('read {} does not overlap with any regions in bedfile\n'.format(read.query_name))
@@ -197,15 +208,15 @@ def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, be
                         counts['all_matches_masked'] +=  1
                         continue
             else:
-                result = stats_from_aligned_read(read, bam.references, bam.lengths)
+                result, lra_flag = stats_from_aligned_read(read, bam.references, bam.lengths)
 
             if min_length is not None and result['length'] < min_length:
                 counts['short'] += 1
                 continue
 
             results.append(result)
 
-    return counts, results
+    return counts, results, lra_flag
 
 
 def main(arguments=None):
@@ -245,17 +256,23 @@ def main(arguments=None):
                              all_alignments=args.all_alignments,
                              min_length=args.min_length, bed_file=args.bed)
 
+    detected_lra = False
     counts = Counter()
-    for batch_counts, results in mapper(func, ranges):
+    for batch_counts, results, lra_flag in mapper(func, ranges):
         counts.update(batch_counts)
         counts['total'] += len(results)
         for result in results:
             out_row = (str(result[x]) for x in headers)
             args.output.write('\t'.join(out_row))
             args.output.write('\n')
+        detected_lra = detected_lra or lra_flag
     if pool is not None:
         pool.shutdown(wait=True)
 
+    if detected_lra:
+        args.summary.write('Some reads contained an NX tag, assuming these were aligned with LRA.\n')
+        args.summary.write('If this is not the case, results may be wrong.\n')
+
     if counts['total'] == 0:
         args.summary.write('No alignments processed. Check your bam and filtering options.\n')