From 965dbfcd8de2501280172fc453f27df0bffcd9c7 Mon Sep 17 00:00:00 2001 From: smarco Date: Mon, 4 Apr 2022 21:49:48 +0200 Subject: [PATCH] Implemented exact-prunning on edit-distance --- wavefront/wavefront_compute_edit.c | 84 ++++++++++++++++++++++++++++++ wavefront/wavefront_heuristic.c | 7 --- wavefront/wavefront_slab.c | 2 +- 3 files changed, 85 insertions(+), 8 deletions(-) diff --git a/wavefront/wavefront_compute_edit.c b/wavefront/wavefront_compute_edit.c index 61d42f6c..46561b23 100644 --- a/wavefront/wavefront_compute_edit.c +++ b/wavefront/wavefront_compute_edit.c @@ -188,6 +188,85 @@ void wavefront_compute_edit_idm_piggyback( curr_offsets[k] = max; } } +/* + * Exact pruning paths + */ +int wf_compute_edit_best_score( + const int pattern_length, + const int text_length, + const int k, + const wf_offset_t offset) { + // Compute best-alignment case + const int left_v = pattern_length - WAVEFRONT_V(k,offset); + const int left_h = text_length - WAVEFRONT_H(k,offset); + return (left_v >= left_h) ? left_v - left_h : left_h - left_v; +} +int wf_compute_edit_worst_score( + const int pattern_length, + const int text_length, + const int k, + const wf_offset_t offset) { + // Compute worst-alignment case + const int left_v = pattern_length - WAVEFRONT_V(k,offset); + const int left_h = text_length - WAVEFRONT_H(k,offset); + return MAX(left_v,left_h); +} +void wavefront_compute_edit_exact_prune( + wavefront_aligner_t* const wf_aligner, + wavefront_t* const wavefront) { + // Parameters + const int plen = wf_aligner->pattern_length; + const int tlen = wf_aligner->text_length; + wf_offset_t* const offsets = wavefront->offsets; + const int lo = wavefront->lo; + const int hi = wavefront->hi; + // Speculative compute if needed + if (WAVEFRONT_LENGTH(lo,hi) < 1000) return; + const int sample_k = lo + (hi-lo)/2; + const wf_offset_t sample_offset = offsets[sample_k]; + if (sample_offset < 0) return; // Unlucky null in the middle + const int smax_sample = wf_compute_edit_worst_score(plen,tlen,sample_k,offsets[sample_k]); + const int smin_lo = wf_compute_edit_best_score(plen,tlen,lo,offsets[lo]); + const int smin_hi = wf_compute_edit_best_score(plen,tlen,hi,offsets[hi]); + if (smin_lo <= smax_sample && smin_hi <= smax_sample) return; + /* + * Suggested by Heng Li as an effective exact-prunning technique + * for sequences of very different length where some diagonals + * can be proven impossible to yield better alignments. + */ + // Compute the best worst-case-alignment + int score_min_worst = INT_MAX; + int k; + for (k=lo;k<=hi;++k) { + const wf_offset_t offset = offsets[k]; + if (offset < 0) continue; // Skip nulls + // Compute worst-alignment case + const int score_worst = wf_compute_edit_worst_score(plen,tlen,k,offset); + if (score_worst < score_min_worst) score_min_worst = score_worst; + } + // Compare against the best-case-alignment (Prune from bottom) + int lo_reduced = lo; + for (k=lo;k<=hi;++k) { + // Compute best-alignment case + const wf_offset_t offset = offsets[k]; + const int score_best = wf_compute_edit_best_score(plen,tlen,k,offset); + // Compare best and worst + if (score_best <= score_min_worst) break; + ++lo_reduced; + } + wavefront->lo = lo_reduced; + // Compare against the best-case-alignment (Prune from top) + int hi_reduced = hi; + for (k=hi;k>lo_reduced;--k) { + // Compute best-alignment case + const wf_offset_t offset = offsets[k]; + const int score_best = wf_compute_edit_best_score(plen,tlen,k,offset); + // Compare best and worst + if (score_best <= score_min_worst) break; + --hi_reduced; + } + wavefront->hi = hi_reduced; +} /* * Compute next wavefront */ @@ -268,6 +347,11 @@ void wavefront_compute_edit( } // Trim wavefront ends wavefront_compute_trim_ends(wf_aligner,wf_curr); + // Exact pruning paths + if (wf_aligner->alignment_form.span == alignment_end2end && + wf_aligner->penalties.distance_metric == edit) { + wavefront_compute_edit_exact_prune(wf_aligner,wf_curr); + } } diff --git a/wavefront/wavefront_heuristic.c b/wavefront/wavefront_heuristic.c index a001ad91..eb8c180b 100644 --- a/wavefront/wavefront_heuristic.c +++ b/wavefront/wavefront_heuristic.c @@ -108,13 +108,6 @@ void wavefront_heuristic_clear( /* * Utils */ -//int wf_compute_antidiagonal( -// const wf_offset_t offset, -// const int k) { -// const int v = WAVEFRONT_V(k,offset); -// const int h = WAVEFRONT_H(k,offset); -// return v + h; -//} int wf_compute_distance_end2end( const wf_offset_t offset, const int k, diff --git a/wavefront/wavefront_slab.c b/wavefront/wavefront_slab.c index 5ef8ab3d..89a897b5 100644 --- a/wavefront/wavefront_slab.c +++ b/wavefront/wavefront_slab.c @@ -258,7 +258,7 @@ wavefront_t* wavefront_slab_allocate( void wavefront_slab_free( wavefront_slab_t* const wavefront_slab, wavefront_t* const wavefront) { - // Check reasons to repurpose wavefront + // Check reasons to repurpose wavefront (NOTE: Tight-mode never slab_frees()) // (A) Reuse-mode and wavefront has current wf-length // (B) Tight-mode and wavefront has init wf-length const int wf_length = wavefront->wf_elements_allocated;