Merge remote-tracking branch 'origin/development' into main

vgteam · Jul 6, 2022 · d933a10 · d933a10
2 parents a8d5cdd + 4e67127
commit d933a10
Show file tree

Hide file tree

Showing 72 changed files with 3,001 additions and 1,019 deletions.
diff --git a/Makefile b/Makefile
@@ -17,12 +17,15 @@ AR=ar
 AR_FLAGS=-rsc
 
 ifndef BUILD_EXAMPLES 
-BUILD_EXAMPLES=1
+BUILD_EXAMPLES=0
 endif
-
 ifndef BUILD_TOOLS 
 BUILD_TOOLS=1
 endif
+ifndef BUILD_WFA_PARALLEL
+BUILD_WFA_PARALLEL=1
+endif
+
 ###############################################################################
 # Configuration rules
 ###############################################################################
@@ -41,16 +44,17 @@ ifeq ($(BUILD_EXAMPLES),1)
     APPS+=examples
 endif
 
-release: CC_FLAGS+=-O3 -march=native -flto
-release: build
-
-all: CC_FLAGS+=-O3 -march=native
+all: CC_FLAGS+=-O3 -march=native #-flto -ffat-lto-objects
 all: build
 
 debug: build
 
+ASAN_OPT=-fsanitize=address -fsanitize=undefined -fsanitize=shift -fsanitize=alignment
+ASAN_OPT+=-fsanitize=signed-integer-overflow -fsanitize=bool -fsanitize=enum
+ASAN_OPT+=-fsanitize=pointer-compare -fsanitize=pointer-overflow -fsanitize=builtin
+
 # ASAN: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=2:log_threads=1
-asan: CC_FLAGS+=-fsanitize=address -fno-omit-frame-pointer -fno-common
+asan: CC_FLAGS+=$(ASAN_OPT) -fno-omit-frame-pointer -fno-common
 asan: build
 
 ###############################################################################

diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 
 ### 1.1 What is WFA?
 
-The wavefront alignment (WFA) algorithm is an **exact** gap-affine algorithm that takes advantage of homologous regions between the sequences to accelerate the alignment process. Unlike to traditional dynamic programming algorithms that run in quadratic time, the WFA runs in time `O(ns+s^2)`, proportional to the sequence length `n` and the alignment score `s`, using `O(s^2)` memory. Moreover, the WFA algorithm exhibits simple computational patterns that the modern compilers can automatically vectorize for different architectures without adapting the code. To intuitively illustrate why the WFA algorithm is so interesting, look at the following figure. The left panel shows the cells computed by a classical dynamic programming based algorithm (like Smith-Waterman or Needleman Wunsch). In contrast, the right panel shows the cells computed by the WFA algorithm to obtain the same result (i.e., the optimal alignment).
+The wavefront alignment (WFA) algorithm is an **exact** gap-affine algorithm that takes advantage of homologous regions between the sequences to accelerate the alignment process. Unlike to traditional dynamic programming algorithms that run in quadratic time, the WFA runs in time `O(ns+s^2)`, proportional to the sequence length `n` and the alignment score `s`, using `O(s^2)` memory (or `O(s)` using the ultralow/BiWFA mode). Moreover, the WFA algorithm exhibits simple computational patterns that the modern compilers can automatically vectorize for different architectures without adapting the code. To intuitively illustrate why the WFA algorithm is so interesting, look at the following figure. The left panel shows the cells computed by a classical dynamic programming based algorithm (like Smith-Waterman or Needleman Wunsch). In contrast, the right panel shows the cells computed by the WFA algorithm to obtain the same result (i.e., the optimal alignment).
 
 <p align = "center">
 <img src = "img/wfa.vs.swg.png" width="750px">
@@ -47,7 +47,7 @@ Section [WFA2-lib features](#wfa2.features) explores the most relevant options a
 - The WFA algorithm is an **exact algorithm**. If no heuristic is applied (e.g., band or adaptive pruning), the core algorithm guarantees to always find the optimal solution (i.e., best alignment score). Since its first release, some authors have referenced the WFA as approximated or heuristic, which is NOT the case.
 
 
-- Given two sequences of length `n`, traditional dynamic-programming (DP) based methods (like Smith-Waterman or Needleman-Wunsch) compute the optimal alignment in `O(n^2)` time, using `O(n^2)` memory. In contrast, the WFA algorithm requires `O(ns+s^2)` time and `O(s^2)` memory (being `s` the optimal alignment score). Therefore, **the memory consumption of the WFA algorithm is not intrinsically higher than that of other methods**. Most DP-based methods can use heuristics (like banded, X-drop, or Z-drop) to reduce the execution time and the memory usage at the expense of losing accuracy. Likewise, **the WFA algorithm can also use heuristics to reduce the execution time and memory usage**.
+- Given two sequences of length `n`, traditional dynamic-programming (DP) based methods (like Smith-Waterman or Needleman-Wunsch) compute the optimal alignment in `O(n^2)` time, using `O(n^2)` memory. In contrast, the WFA algorithm requires `O(ns+s^2)` time and `O(s^2)` memory (being `s` the optimal alignment score). Therefore, **the memory consumption of the WFA algorithm is not intrinsically higher than that of other methods**. Most DP-based methods can use heuristics (like banded, X-drop, or Z-drop) to reduce the execution time and the memory usage at the expense of losing accuracy. Likewise, **the WFA algorithm can also use heuristics to reduce the execution time and memory usage**. Moreover, the memory mode `ultralow` uses the BiWFA algorithm to execute in `O(ns+s^2)` time and linear `O(s)` memory.
 
 
 - **A note for the fierce competitors.** I can understand that science and publishing have become a fierce competition these days. Many researchers want their methods to be successful and popular, seeking funding, tenure, or even fame. If you are going to benchmark the WFA using the least favourable configuration, careless programming, and a disadvantageous setup, please, go ahead. But remember, researchers like you have put a lot of effort into developing the WFA. We all joined this "competition" because we sought to find better methods that could truly help other researchers. So, try to be nice, tone down the marketing, and produce fair evaluations and honest publications.
@@ -147,7 +147,7 @@ cout << "Alignment score " << aligner.getAlignmentScore() << endl;
 * **Exact alignment** method that computes the optimal **alignment score** and/or **alignment CIGAR**.
 * Supports **multiple distance metrics** (i.e., indel, edit, gap-lineal, gap-affine, and dual-gap gap-affine).
 * Allows performing **end-to-end** (a.k.a. global) and **ends-free** (e.g., semi-global, extension, overlap) alignment.
-* Implements **low-memory modes** to reduce and control memory consumption.
+* Implements **low-memory modes** to reduce and control memory consumption (down to `O(s)` using the `ultralow` mode).
 * Supports various **heuristic strategies** to use on top of the core WFA algorithm.
 * WFA2-lib **operates with plain ASCII strings**. Although we mainly focus on aligning DNA/RNA sequences, the WFA algorithm and the WFA2-lib implementation work with any pair of strings. Moreover, these sequences do not have to be pre-processed (e.g., packed or profiled), nor any table must be precomputed (like the query profile, used within some Smith-Waterman implementations).
 * Due to its simplicity, the WFA algorithm can be automatically vectorized for any SIMD-compliant CPU supported by the compiler. For this reason, **the WFA2-lib implementation is independent of any specific ISA or processor model**. Unlike other hardware-dependent libraries, we aim to offer a multiplatform pairwise-alignment library that can be executed on different processors and models (e.g., SSE, AVX2, AVX512, POWER-ISA, ARM, NEON, SVE, SVE2, RISCV-RVV, ...).
@@ -385,7 +385,7 @@ The WFA2 library allows computing alignments with different spans or shapes. Alt
 
 ### <a name="wfa2.mem"></a> 3.4 Memory modes
 
-The WFA2 library implements various memory modes: `wavefront_memory_high`, `wavefront_memory_med`, `wavefront_memory_low`. These modes allow regulating the overall memory consumption at the expense of execution time. The standard WFA algorithm, which stores explicitly all wavefronts in memory, correspond to the mode `wavefront_memory_high`. The other methods progressively reduce memory usage at the expense of slightly larger alignment times. These memory modes can be used transparently with other alignment options and generate identical results. Note that this option does not affect the score-only alignment mode (it already uses a minimal memory footprint of `O(s)`).
+The WFA2 library implements various memory modes: `wavefront_memory_high`, `wavefront_memory_med`, `wavefront_memory_low`, and , `wavefront_memory_ultralow`. These modes allow regulating the overall memory consumption at the expense of execution time. The standard WFA algorithm, which stores explicitly all wavefronts in memory, correspond to the mode `wavefront_memory_high`. The other methods progressively reduce memory usage at the expense of slightly larger alignment times. These memory modes can be used transparently with other alignment options and generate identical results. Note that this option does not affect the score-only alignment mode (it already uses a minimal memory footprint of `O(s)`).
 
 ```C
   wavefront_aligner_attr_t attributes = wavefront_aligner_attr_default;
@@ -394,7 +394,7 @@ The WFA2 library implements various memory modes: `wavefront_memory_high`, `wave
 
 ### <a name="wfa2.heuristics"></a> 3.5 Heuristic modes
 
-The WFA algorithm can be used combined with many heuristics to reduce the alignment time and memory used. As it happens to other alignment methods, heuristics can result in suboptimal solutions and loss of accuracy. Moreover, some heuristics may drop the alignment if the sequences exceed certain divergence thresholds (i.e., x-drop/z-drop). Due to the popularity and efficiency of these methods, the WFA2 library implements many of these heuristics. Note, **it is not about how little DP-matrix you compute, but about how good the resulting alignments are.**
+The WFA algorithm can be used combined with many heuristics to reduce the alignment time and memory used. As it happens to other alignment methods, heuristics can result in suboptimal solutions and loss of accuracy. Moreover, some heuristics may drop the alignment if the sequences exceed certain divergence thresholds (i.e., x-drop/z-drop). Due to the popularity and efficiency of these methods, the WFA2 library implements many of these heuristics. Note, **it is not about how little DP-matrix you compute, but about how good/accurate the resulting alignments are.**
 
 - **None (for comparison)**. If no heuristic is used, the WFA behaves exploring cells of the DP-matrix in increasing score order (increasing scores correspond to colours from blue to red). 
 
@@ -500,11 +500,20 @@ The WFA algorithm can be used combined with many heuristics to reduce the alignm
   attributes.heuristic.steps_between_cutoffs = 100;
 ```
 
-<!-- ### <a name="wfa2.other"></a> 3.6 Other features -->
+### <a name="wfa2.other.notes"></a> 3.6 Some technical notes
+
+- Thanks to Eizenga's formulation, WFA2-lib can operate with any match score. Although, in practice, M=0 is still the most efficient choice as it takes advantage of matches between the sequences to accelerate the alignment computation.
+
+- All WFA2-lib algorithms/variants are stable. That is, for alignments having the same score, the library always resolves ties (between M, X, I,and D) using the same criteria: M (highest prio) > X > D > I (lowest prio). Nevertheless, the memory mode `ultralow` (BiWFA) is optimal (always reports the best alignment) but not stable.
 
 ## <a name="wfa2.complains"></a> 4. REPORTING BUGS AND FEATURE REQUEST
 
-Feedback and bug reporting is highly appreciated. Please report any issue or suggestion on github or email to the main developer ([email protected]).
+Feedback and bug reporting is highly appreciated. Please report any issue or suggestion on github or email to the main developer ([email protected]). Don't hesitate to contact us
+if:
+  - You experience any bug or crash.
+  - You want to request a feature or have any suggestion.
+  - Your application using the library is running slower than it should or you expected.
+  - Need help integrating the library into your tool.
 
 ## <a name="wfa2.licence"></a> 5. LICENSE
 
@@ -528,3 +537,5 @@ Miquel Moretó has contributed with fruitful technical discussions and tireless
 
 **Santiago Marco-Sola, Juan Carlos Moure, Miquel Moreto, Antonio Espinosa**. ["Fast gap-affine pairwise alignment using the wavefront algorithm."](https://doi.org/10.1093/bioinformatics/btaa777) Bioinformatics, 2020.
 
+**Santiago Marco-Sola, Jordan M Eizenga, Andrea Guarracino, Benedict Paten, Erik Garrison, Miquel Moreto**. Optimal gap-affine alignment in O(s) space.  _bioRxiv_  (2022). DOI [2022.04.14.488380](https://doi.org/10.1101/2022.04.14.488380)
+
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-v2.1
+v2.2
diff --git a/alignment/cigar.c b/alignment/cigar.c
@@ -251,6 +251,41 @@ void cigar_copy(
          cigar_src->operations+cigar_src->begin_offset,
          cigar_src->end_offset-cigar_src->begin_offset);
 }
+void cigar_append(
+    cigar_t* const cigar_dst,
+    cigar_t* const cigar_src) {
+  // Append
+  const int cigar_length = cigar_src->end_offset - cigar_src->begin_offset;
+  char* const operations_src = cigar_src->operations + cigar_src->begin_offset;
+  char* const operations_dst = cigar_dst->operations + cigar_dst->end_offset;
+  memcpy(operations_dst,operations_src,cigar_length);
+  // Update offset
+  cigar_dst->end_offset += cigar_length;
+}
+void cigar_append_deletion(
+    cigar_t* const cigar,
+    const int length) {
+  // Append deletions
+  char* const operations = cigar->operations + cigar->end_offset;
+  int i;
+  for (i=0;i<length;++i) {
+    operations[i] = 'D';
+  }
+  // Update offset
+  cigar->end_offset += length;
+}
+void cigar_append_insertion(
+    cigar_t* const cigar,
+    const int length) {
+  // Append insertions
+  char* const operations = cigar->operations + cigar->end_offset;
+  int i;
+  for (i=0;i<length;++i) {
+    operations[i] = 'I';
+  }
+  // Update offset
+  cigar->end_offset += length;
+}
 bool cigar_check_alignment(
     FILE* const stream,
     const char* const pattern,

diff --git a/alignment/cigar.h b/alignment/cigar.h
@@ -104,6 +104,17 @@ int cigar_cmp(
 void cigar_copy(
     cigar_t* const cigar_dst,
     cigar_t* const cigar_src);
+
+void cigar_append(
+    cigar_t* const cigar_dst,
+    cigar_t* const cigar_src);
+void cigar_append_deletion(
+    cigar_t* const cigar,
+    const int length);
+void cigar_append_insertion(
+    cigar_t* const cigar,
+    const int length);
+
 bool cigar_check_alignment(
     FILE* const stream,
     const char* const pattern,

diff --git a/bindings/cpp/WFAligner.cpp b/bindings/cpp/WFAligner.cpp
@@ -51,6 +51,7 @@ WFAligner::WFAligner(
     case MemoryHigh: this->attributes.memory_mode = wavefront_memory_high; break;
     case MemoryMed: this->attributes.memory_mode = wavefront_memory_med; break;
     case MemoryLow: this->attributes.memory_mode = wavefront_memory_low; break;
+    case MemoryUltralow: this->attributes.memory_mode = wavefront_memory_ultralow; break;
     default: this->attributes.memory_mode = wavefront_memory_high; break;
   }
   this->attributes.alignment_scope = (alignmentScope==Score) ? compute_score : compute_alignment;
@@ -127,9 +128,11 @@ WFAligner::AlignmentStatus WFAligner::alignEndsFree(
     const int textBeginFree,
     const int textEndFree) {
   // Delegate
-  return alignEnd2End(
+  return alignEndsFree(
       pattern.c_str(),pattern.length(),
-      text.c_str(),text.length());
+      patternBeginFree,patternEndFree,
+      text.c_str(),text.length(),
+      textBeginFree,textEndFree);
 }
 /*
  * Alignment resume
@@ -194,11 +197,9 @@ void WFAligner::setMaxAlignmentScore(
       wfAligner,maxAlignmentScore);
 }
 void WFAligner::setMaxMemory(
-    const uint64_t maxMemoryCompact,
     const uint64_t maxMemoryResident,
     const uint64_t maxMemoryAbort) {
-  wavefront_aligner_set_max_memory(wfAligner,
-      maxMemoryCompact,maxMemoryResident,maxMemoryAbort);
+  wavefront_aligner_set_max_memory(wfAligner,maxMemoryResident,maxMemoryAbort);
 }
 /*
  * Accessors

diff --git a/bindings/cpp/WFAligner.hpp b/bindings/cpp/WFAligner.hpp
@@ -53,6 +53,7 @@ class WFAligner {
     MemoryHigh,
     MemoryMed,
     MemoryLow,
+    MemoryUltralow,
   };
   enum AlignmentScope {
     Score,
@@ -129,7 +130,6 @@ class WFAligner {
   void setMaxAlignmentScore(
       const int maxAlignmentScore);
   void setMaxMemory(
-      const uint64_t maxMemoryCompact,
       const uint64_t maxMemoryResident,
       const uint64_t maxMemoryAbort);
   // Accessors

diff --git a/examples/Makefile b/examples/Makefile
@@ -9,17 +9,18 @@ FOLDER_BIN=bin
 # Rules
 ###############################################################################
 LIB_WFA=$(FOLDER_LIB)/libwfa.a
+LIBS=-fopenmp -lm
 
 all: setup examples_c examples_cpp
 
 examples_c: *.c $(LIB_WFA)
-	$(CC) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_basic.c -o $(FOLDER_BIN)/wfa_basic -lwfa -lm
-	$(CC) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_adapt.c -o $(FOLDER_BIN)/wfa_adapt -lwfa -lm
-	$(CC) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_repeated.c -o $(FOLDER_BIN)/wfa_repeated -lwfa -lm
+	$(CC) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_basic.c -o $(FOLDER_BIN)/wfa_basic -lwfa $(LIBS)
+	$(CC) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_adapt.c -o $(FOLDER_BIN)/wfa_adapt -lwfa $(LIBS)
+	$(CC) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_repeated.c -o $(FOLDER_BIN)/wfa_repeated -lwfa $(LIBS)
 
 examples_cpp: *.cpp $(LIB_WFA)
-	$(CPP) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_bindings.cpp -o $(FOLDER_BIN)/wfa_bindings -lwfacpp
-	$(CPP) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_lambda.cpp -o $(FOLDER_BIN)/wfa_lambda -lwfacpp
+	$(CPP) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_bindings.cpp -o $(FOLDER_BIN)/wfa_bindings -lwfacpp $(LIBS)
+	$(CPP) $(CC_FLAGS) -L$(FOLDER_LIB) -I$(FOLDER_WFA) wfa_lambda.cpp -o $(FOLDER_BIN)/wfa_lambda -lwfacpp $(LIBS)
 
 setup:
 	@mkdir -p $(FOLDER_BIN)

diff --git a/scripts/alg.cmp.score.sh b/scripts/alg.cmp.score.sh
diff --git a/scripts/fasta2seq.sh b/scripts/fasta2seq.sh
@@ -1,2 +1,13 @@
 #!/bin/bash
-awk '{if($0~/^>/){printf("\n>")}else{printf($0)}}' | tail -n +2
+# PROJECT: Wavefront Alignments Algorithms 
+# LICENCE: MIT License 
+# AUTHOR(S): Santiago Marco-Sola <[email protected]>
+# DESCRIPTION: Convert FASTA format to SEQ format
+# USAGE: ./fasta2seq.sh file.fasta file.seq 
+
+FILE_FASTA=$1
+FILE_SEQ=$2
+
+awk '{ if (NR%4==1) {printf(">")} \
+       else if (NR%4==3) {printf("<")} \
+       else {print($0)} }' $FILE_FASTA > $FILE_SEQ
diff --git a/scripts/seq2fasta.sh b/scripts/seq2fasta.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# PROJECT: Wavefront Alignments Algorithms 
+# LICENCE: MIT License 
+# AUTHOR(S): Santiago Marco-Sola <[email protected]>
+# DESCRIPTION: Convert SEQ format to FASTA format
+# USAGE: ./seq2fasta.sh file.seq file.fasta
+
+FILE_SEQ=$1
+FILE_FASTA=$2
+
+cat $FILE_SEQ | paste - - | awk '{ \
+  s1=substr($1,2,length($1)-1);    \
+  s2=substr($2,2,length($2)-1);    \
+  printf(">Seq.%d/1\n%s\n>Seq.%d/2\n%s\n",NR,s1,NR,s2)}' > $FILE_FASTA
diff --git a/scripts/wfa.utest.cmp.sh → scripts/utests/wfa.utest.cmp.alignments.sh b/scripts/wfa.utest.cmp.sh → scripts/utests/wfa.utest.cmp.alignments.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
-# PROJECT: Wavefront Alignments Algorithms
+# PROJECT: Wavefront Alignments Algorithms (Unitary Tests)
 # LICENCE: MIT License 
 # AUTHOR(S): Santiago Marco-Sola <[email protected]>
-# DESCRIPTION: WFA unitary tests (for performance & correcness)
-# USAGE: ./wfa.utest.cmp.sh WFAv1 WFAv2
+# DESCRIPTION: Compares alignments (*.alg) from two folders
+# USAGE: ./wfa.utest.cmp.alignments.sh wfa_results_folder_1 wfa_results_folder_2
 
 # Parameters
 FOLDER1=$1
@@ -15,17 +15,25 @@ do
   FILENAME=$(basename -- "$FILE_ALG1")
   PREFIX=${FILENAME%.*}
   FILE_ALG2="$FOLDER2/$FILENAME"
-  echo -ne "[UTest::$PREFIX]  \t"
+  echo -ne "[UTest::$PREFIX]"
+  if [[ ${#PREFIX} < 15 ]]; then echo -ne "   "; fi
+  echo -ne "\t"
   # Check existence
-  if [[ ! -f "$FILE_ALG2" ]]; then
+  if [[ ! -f "$FILE_ALG2" ]]
+  then
     echo "$FILE_ALG2 doesn't exist."
     continue
   fi
   # Check diff
   if [[ $(diff $FILE_ALG1 $FILE_ALG2) ]] 
   then
-    echo "Error"
-    continue
+    if [[ $(diff <(awk '{if ($1<0) print -$1; else print $1}' $FILE_ALG1) <(awk '{if ($1<0) print -$1; else print $1}' $FILE_ALG2)) ]]
+    then
+      echo "Error"
+      continue
+    else
+      echo -n "OK" # Only score
+    fi
   else
     echo -n "OK"
   fi