Commit aebc82d

Run e2e-tests in separate dir + reuse_files + record_history (#41)
* b2w and inference: rerun windows that failed + check if window was processed already before adding process to process list

* reference to manuscript

* restarting failed process doesn't work, exit again

* fix: check if files already exist

* add minimum number of iterations

* pass unique_modus argument

* Revert "Merge pull request #38 from cbg-ethz/enable-unique-modus"

This reverts commit ffd4fd1, reversing
changes made to 279cc63.

* rebase to master

* update readme

* no poetry lock file

* add param record_history

* add command line params: reuse_files + record_history

* pass record_history + implement reuse_files

* implement reuse_files

* typo

* add param record_history

* fix syntax

* [fix] unexpected keyword argument 'metavar'

* fix ctime

* add pathlib package

* fix the pathlib import

* fix: too many arguments to unpack

* fix params to unpack

* [e2e tests] run the e2e tests in separate directories

* partially addresses #40

* enable unique_modus
LaraFuhrmann authored Dec 3, 2024
1 parent 65e6058 commit aebc82d
Showing 11 changed files with 198 additions and 895 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
@@ -11,12 +11,12 @@ RUN apt-get update -y && \

RUN pip install "poetry==$POETRY_VERSION"

-COPY pyproject.toml poetry.lock /usr/app/
+COPY pyproject.toml /usr/app/

# https://stackoverflow.com/questions/53835198/integrating-python-poetry-with-docker
RUN cd /usr/app && poetry install --no-interaction --no-ansi --no-root

# GitHub Actions chimes in here and sets docker's WORKDIR=${GITHUB_WORKSPACE}
# https://docs.github.com/en/actions/creating-actions/dockerfile-support-for-github-actions#workdir
# poetry install --only-root would be more elegant but does not work in Github Actions
-CMD poetry install --no-interaction --no-ansi && cd ./tests && poetry run pytest
+CMD poetry install --no-interaction --no-ansi && cd ./tests && poetry run pytest
743 changes: 0 additions & 743 deletions poetry.lock

This file was deleted.

2 changes: 1 addition & 1 deletion tests/data_1/shotgun_test.sh
@@ -1,4 +1,4 @@
#!/bin/bash

viloca run -a 0.1 -w 201 --mode shorah -x 100000 -p 0.9 -c 0 \
-    -r HXB2:2469-3713 -R 42 -f test_ref.fasta -b test_aln.cram --out_format csv "$@"
+    -r HXB2:2469-3713 -R 42 -f ../test_ref.fasta -b ../test_aln.cram --out_format csv "$@"
17 changes: 13 additions & 4 deletions tests/test_shotgun_e2e.py
@@ -27,22 +27,31 @@ def run(base_files):
run(base_files)

def test_e2e_shorah():
+    cwd_test_e2e = "./data_1/test_e2e_shorah"
+    # Check if the directory exists, if not, create it
+    if not os.path.exists(cwd_test_e2e):
+        os.makedirs(cwd_test_e2e)

original = subprocess.run(
"./shotgun_test.sh", shell=True, check=True, cwd=cwd
"../shotgun_test.sh", shell=True, check=True, cwd=cwd_test_e2e
)
assert original.returncode == 0
assert os.path.isfile(os.path.join(cwd, f_path)) == False

assert filecmp.cmp(
"./data_1/test.csv",
"./data_1/work/snv/SNVs_0.010000_final.csv",
"./data_1/test_e2e_shorah/work/snv/SNVs_0.010000_final.csv",
shallow=False
)

def test_e2e_shorah_with_extended_window_mode():
+    cwd_test_e2e_extended_window_mode = "./data_1/test_e2e_extended_window_mode"
+    # Check if the directory exists, if not, create it
+    if not os.path.exists(cwd_test_e2e_extended_window_mode):
+        os.makedirs(cwd_test_e2e_extended_window_mode)
p = subprocess.run(
"./shotgun_test.sh --extended_window_mode", shell=True, check=True, cwd=cwd
"../shotgun_test.sh --extended_window_mode", shell=True, check=True, cwd=cwd_test_e2e_extended_window_mode
)

assert p.returncode == 0
-    assert os.path.isfile(os.path.join(cwd, f_path)) == True
+    assert os.path.isfile(os.path.join(cwd_test_e2e_extended_window_mode, f_path)) == True
10 changes: 7 additions & 3 deletions viloca/b2w.py
@@ -7,6 +7,7 @@
from multiprocessing import Process, cpu_count
import os
import hashlib
+from pathlib import Path

def _write_to_file(lines, file_name):
with open(file_name, "w") as f:
@@ -340,7 +341,7 @@ def parallel_run_one_window(
indel_map,
max_ins_at_pos,
extended_window_mode,
-    exclude_non_var_pos_threshold
+    exclude_non_var_pos_threshold,
):
"""
build one window.
@@ -510,7 +511,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
exact_conformance_fix_0_1_basing_in_reads: Optional[bool] = False,
extended_window_mode: Optional[bool] = False,
exclude_non_var_pos_threshold: Optional[float] = -1,
-                  maxthreads: Optional[int] = 1) -> None:
+                  maxthreads: Optional[int] = 1,
+                  reuse_files = False) -> None:
"""Summarizes reads aligned to reference into windows.
Three products are created:
#. Multiple FASTA files (one for each window position)
@@ -580,7 +582,7 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
all_processes = []
for idx, (window_start, window_length, control_window_length) in enumerate(tiling):
counter = counter_list[idx]
-        if not os.path.isfile(f"coverage_{idx}.txt"):
+        if not (reuse_files and os.path.isfile(f"coverage_{idx}.txt")):
p = Process(
target=parallel_run_one_window,
args=(
@@ -605,6 +607,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
)
)
all_processes.append(p)
+        else:
+            logging.info(f'[file already exists] Use window files generated on {time.ctime(Path(f"coverage_{idx}.txt").stat().st_mtime)}')

for p in all_processes:
p.start()
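The skip condition above is the heart of reuse_files: a window is rebuilt unless the flag is set and the window's coverage file already exists on disk. A standalone sketch of that gate, assuming only the coverage_{idx}.txt naming visible in the diff (the function itself is illustrative):

import logging
import time
from pathlib import Path

def should_rebuild_window(idx, reuse_files):
    coverage = Path(f"coverage_{idx}.txt")
    if reuse_files and coverage.is_file():
        # st_mtime is the file's last-modification time; time.ctime
        # renders it as a human-readable timestamp for the log.
        logging.info(
            "[file already exists] Use window files generated on "
            + time.ctime(coverage.stat().st_mtime)
        )
        return False
    return True

Logging st_mtime rather than a true creation time is the portable choice on Unix filesystems, which is presumably what the earlier "fix ctime" commit was about.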
7 changes: 7 additions & 0 deletions viloca/cli.py
@@ -200,6 +200,13 @@ def main():

parser_shotgun.add_argument('--exclude_non_var_pos_threshold', metavar='FLOAT', type=float, dest="exclude_non_var_pos_threshold",
default=-1, help="Runs exclude non-variable positions mode. Set percentage threshold for exclusion.")

+    parser_shotgun.add_argument('--reuse_files', action='store_true', dest="reuse_files",
+        default=False, help="Reuse output files generated in previous runs: the tool checks for existing files and skips the corresponding regeneration step, avoiding redundant computation.")

+    parser_shotgun.add_argument('--record_history', action='store_true', dest="record_history",
+        default=False, help="When enabled, this option saves the history of the parameter values learned during the inference process.")

parser_shotgun.add_argument("--min_windows_coverage", metavar='INT', type=int,
required=False, default=2, dest="min_windows_coverage",
help="Number of windows that need to cover a mutation to have it called.")
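Both new options are plain store_true switches: the flag's presence sets the value to True and its absence leaves the default, so the default must be falsy (False): a sentinel like -1 is truthy and would make an unset flag behave as if it were enabled. A sketch of a rerun that reuses existing window files and keeps the inference history, following the invocation style of the e2e tests (the fixture paths and working directory are purely an example):

import subprocess

subprocess.run(
    "viloca run -a 0.1 -w 201 --mode shorah "
    "-f ../test_ref.fasta -b ../test_aln.cram "
    "--reuse_files --record_history",
    shell=True, check=True, cwd="./data_1/test_e2e_shorah",
)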
81 changes: 44 additions & 37 deletions viloca/local_haplotype_inference/learn_error_params/cavi.py
@@ -34,6 +34,7 @@ def multistart_cavi(
reads_weights,
n_starts,
output_dir,
+    record_history
):

pool = mp.Pool(mp.cpu_count())
@@ -51,6 +52,7 @@
reads_weights,
start,
output_dir,
+                record_history
),
callback=collect_result,
)
@@ -72,6 +74,7 @@ def run_cavi(
reads_weights,
start_id,
output_dir,
+    record_history
):

"""
@@ -85,16 +88,7 @@
"alphabet": alphabet,
} # N= #reads, K= #components

-    history_alpha = []
-    history_mean_log_pi = []
-    history_theta_c = []
-    history_theta_d = []
-    history_mean_log_theta = []
-    history_gamma_a = []
-    history_gamma_b = []
-    history_mean_log_gamma = []
-    history_mean_haplo = []
-    history_mean_cluster = []

history_elbo = []

state_init_dict = initialization.draw_init_state(
@@ -130,8 +124,8 @@
converged = False
elbo = 0
state_curr_dict = state_init_dict
-    k = 0
-    while converged is False:
+    min_number_iterations = 10
+    while (converged is False) or (iter < min_number_iterations):

if iter <= 1:
digamma_alpha_sum = digamma(state_curr_dict["alpha"].sum(axis=0))
@@ -160,7 +154,7 @@
state_init_dict,
state_curr_dict,
)

-        if k % 2 == 0:
+        if iter % 2 == 0:
history_elbo.append(elbo)
history_mean_log_pi.append(state_curr_dict["mean_log_pi"])
@@ -184,11 +178,11 @@
break
elif np.abs(elbo - history_elbo[-2]) < 1e-03:
converged = True
-            k += 1
+            iter += 1
message = "ELBO converged."
exitflag = 0
else:
-        k = 0
+        iter = 0

# if k%10==0: # every 10th parameter set is saved to history
state_curr_dict.update({"elbo": elbo})
@@ -198,28 +192,41 @@

state_curr_dict.update({"elbo": elbo})

-    dict_result.update(
-        {
-            "exit_message": message,
-            "exitflag": exitflag,
-            "n_iterations": iter,
-            "converged": converged,
-            "elbo": elbo,
-            "history_mean_log_theta": history_mean_log_theta,
-            "history_elbo": history_elbo,
-            "history_alpha": history_alpha,
-            "history_mean_log_pi": history_mean_log_pi,
-            "history_theta_c": history_theta_c,
-            "history_alpha": history_alpha,
-            "history_theta_d": history_theta_d,
-            "history_mean_log_theta": history_mean_log_theta,
-            "history_gamma_a": history_gamma_a,
-            "history_gamma_b": history_gamma_b,
-            "history_mean_log_gamma": history_mean_log_gamma,
-            "history_mean_haplo": history_mean_haplo,
-            "history_mean_cluster": history_mean_cluster,
-        }
-    )
+    if record_history:
+        dict_result.update(
+            {
+                "exit_message": message,
+                "exitflag": exitflag,
+                "n_iterations": iter,
+                "converged": converged,
+                "elbo": elbo,
+                "history_elbo": history_elbo,
+                "history_alpha": history_alpha,
+                "history_mean_log_pi": history_mean_log_pi,
+                "history_theta_c": history_theta_c,
+                "history_theta_d": history_theta_d,
+                "history_mean_log_theta": history_mean_log_theta,
+                "history_gamma_a": history_gamma_a,
+                "history_gamma_b": history_gamma_b,
+                "history_mean_log_gamma": history_mean_log_gamma,
+                "history_mean_haplo": history_mean_haplo,
+                "history_mean_cluster": history_mean_cluster,
+            }
+        )
+    else:
+        dict_result.update(
+            {
+                "exit_message": message,
+                "exitflag": exitflag,
+                "n_iterations": iter,
+                "converged": converged,
+                "elbo": elbo,
+                "history_elbo": history_elbo,
+            }
+        )


# dict_result.update(state_curr_dict)
summary = analyze_results.summarize_results(
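Boiled down, the new control flow in run_cavi does two things: the loop keeps running until the ELBO change drops below 1e-03 and at least min_number_iterations iterations have elapsed, and the bulky per-iteration history only lands in the result dict when record_history is set. A self-contained sketch of that loop shape (the ELBO update is a dummy stand-in, not the actual CAVI update equations):

import numpy as np

def toy_cavi_loop(record_history, min_number_iterations=10):
    history_elbo = []    # always kept: the convergence test needs it
    history_params = []  # only returned when record_history is set
    converged, elbo, iteration = False, 0.0, 0

    while (converged is False) or (iteration < min_number_iterations):
        elbo = -1.0 / (iteration + 1)  # dummy, monotonically increasing "ELBO"
        history_elbo.append(elbo)
        if record_history:
            history_params.append({"iteration": iteration, "elbo": elbo})
        # Converged once two successive ELBO values differ by less than 1e-03.
        if len(history_elbo) >= 2 and np.abs(elbo - history_elbo[-2]) < 1e-03:
            converged = True
        iteration += 1

    result = {"n_iterations": iteration, "converged": converged,
              "elbo": elbo, "history_elbo": history_elbo}
    if record_history:
        result["history_params"] = history_params
    return result

The sketch calls the counter iteration; the committed code reuses the name iter, which shadows Python's built-in iter().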
@@ -29,7 +29,7 @@ def gzip_file(f_name):
return f_out.name


-def main(freads_in, fref_in, output_dir, n_starts, K, alpha0, alphabet="ACGT-", unique_modus=False):
+def main(freads_in, fref_in, output_dir, n_starts, K, alpha0, alphabet="ACGT-", unique_modus=True, record_history=False):

window_id = freads_in.split("/")[-1][:-4] # freads_in is absolute path

@@ -39,7 +39,7 @@
# Read in reads
reference_seq, ref_id = preparation.load_reference_seq(fref_in)
reference_binary = preparation.reference2binary(reference_seq, alphabet)
-    reads_list = preparation.load_fasta2reads_list(freads_in, alphabet, False)
+    reads_list = preparation.load_fasta2reads_list(freads_in, alphabet, unique_modus)
reads_seq_binary, reads_weights = preparation.reads_list_to_array(reads_list)

if n_starts >1:
@@ -88,12 +88,12 @@

logging.info("CAVI termination " + str(sort_results[0][2]["exit_message"]))

-    with open(output_name + "all_results.pkl", "wb") as f2:
-        pickle.dump(sort_results, f2)
-
-    logging.info(
-        "Results dicts of all runs written to " + output_name + "all_results.pkl"
-    )
+    if record_history:
+        with open(output_name + "all_results.pkl", "wb") as f2:
+            pickle.dump(sort_results, f2)
+        logging.info(
+            "Results dicts of all runs written to " + output_name + "all_results.pkl"
+        )

state_curr_dict = result_list[max_idx][1]
logging.info("Maximal ELBO " + str(max_elbo) + "in run " + str(max_idx))
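The last two hunks thread unique_modus and record_history through to where they matter: load_fasta2reads_list now receives unique_modus instead of a hard-coded False, and the multi-start results pickle is only written when record_history is set. Conceptually, unique_modus collapses identical reads into a single entry whose weight records the multiplicity, which is what reads_list_to_array's reads_weights output then carries into the CAVI runs. A hedged sketch of that collapsing step (the function and its return shapes are assumptions for illustration, not the package's actual signatures):

from collections import Counter

def collapse_reads(seqs, unique_modus=True):
    # With unique_modus, identical read sequences merge into one entry
    # and each surviving sequence carries its multiplicity as a weight;
    # without it, every read keeps weight 1.
    if unique_modus:
        counts = Counter(seqs)  # insertion-ordered in Python 3.7+
        return list(counts.keys()), list(counts.values())
    return list(seqs), [1] * len(seqs)

reads, weights = collapse_reads(["ACGT", "ACGT", "ACG-"])
assert reads == ["ACGT", "ACG-"] and weights == [2, 1]

Gating the all_results.pkl dump behind record_history keeps routine runs from writing large multi-start pickles to disk.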