Commit aebc82d

Run e2e-tests in separate dir + reuse_files + record_history (#41)
* b2w and inference: rerun windows that failed + check if window was processed already before adding process to process list

* reference to manuscript

* restarting failed process doesn't work, exit again

* fix: check if files already exist

* add minimum number of iterations

* pass unique_modus argument

* Revert "Merge pull request #38 from cbg-ethz/enable-unique-modus"

This reverts commit ffd4fd1, reversing
changes made to 279cc63.

* rebase to master

* update readme

* no poetry lock file

* add param record_history

* add command line params: reuse_files + record_history

* pass record_history + implement reuse_files

* implement reuse_files

* typo

* add param record_history

* fix syntax

* [fix] unexpected keyword argument 'metavar'

* fix ctime

* add pathlib package

* fix the pathlib import

* fix: too many arguments to unpack

* fix params to unpack

* [e2e tests] run the e2e tests in separate directories

* partially addresses #40

* enable unique_modus
LaraFuhrmann authored Dec 3, 2024
1 parent 65e6058 commit aebc82d
Showing 11 changed files with 198 additions and 895 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
@@ -11,12 +11,12 @@ RUN apt-get update -y && \

RUN pip install "poetry==$POETRY_VERSION"

-COPY pyproject.toml poetry.lock /usr/app/
+COPY pyproject.toml /usr/app/

# https://stackoverflow.com/questions/53835198/integrating-python-poetry-with-docker
RUN cd /usr/app && poetry install --no-interaction --no-ansi --no-root

# GitHub Actions chimes in here and sets docker's WORKDIR=${GITHUB_WORKSPACE}
# https://docs.github.com/en/actions/creating-actions/dockerfile-support-for-github-actions#workdir
# poetry install --only-root would be more elegant but does not work in Github Actions
-CMD poetry install --no-interaction --no-ansi && cd ./tests && poetry run pytest
+CMD poetry install --no-interaction --no-ansi && cd ./tests && poetry run pytest
743 changes: 0 additions & 743 deletions poetry.lock

This file was deleted.

2 changes: 1 addition & 1 deletion tests/data_1/shotgun_test.sh
@@ -1,4 +1,4 @@
#!/bin/bash

viloca run -a 0.1 -w 201 --mode shorah -x 100000 -p 0.9 -c 0 \
-    -r HXB2:2469-3713 -R 42 -f test_ref.fasta -b test_aln.cram --out_format csv "$@"
+    -r HXB2:2469-3713 -R 42 -f ../test_ref.fasta -b ../test_aln.cram --out_format csv "$@"
17 changes: 13 additions & 4 deletions tests/test_shotgun_e2e.py
@@ -27,22 +27,31 @@ def run(base_files):
run(base_files)

def test_e2e_shorah():
+    cwd_test_e2e = "./data_1/test_e2e_shorah"
+    # Check if the directory exists, if not, create it
+    if not os.path.exists(cwd_test_e2e):
+        os.makedirs(cwd_test_e2e)

original = subprocess.run(
"./shotgun_test.sh", shell=True, check=True, cwd=cwd
"../shotgun_test.sh", shell=True, check=True, cwd=cwd_test_e2e
)
assert original.returncode == 0
assert os.path.isfile(os.path.join(cwd, f_path)) == False

assert filecmp.cmp(
"./data_1/test.csv",
"./data_1/work/snv/SNVs_0.010000_final.csv",
"./data_1/test_e2e_shorah/work/snv/SNVs_0.010000_final.csv",
shallow=False
)

def test_e2e_shorah_with_extended_window_mode():
+    cwd_test_e2e_extended_window_mode = "./data_1/test_e2e_extended_window_mode"
+    # Check if the directory exists, if not, create it
+    if not os.path.exists(cwd_test_e2e_extended_window_mode):
+        os.makedirs(cwd_test_e2e_extended_window_mode)
p = subprocess.run(
"./shotgun_test.sh --extended_window_mode", shell=True, check=True, cwd=cwd
"../shotgun_test.sh --extended_window_mode", shell=True, check=True, cwd=cwd_test_e2e_extended_window_mode
)

assert p.returncode == 0
-    assert os.path.isfile(os.path.join(cwd, f_path)) == True
+    assert os.path.isfile(os.path.join(cwd_test_e2e_extended_window_mode, f_path)) == True
10 changes: 7 additions & 3 deletions viloca/b2w.py
@@ -7,6 +7,7 @@
from multiprocessing import Process, cpu_count
import os
import hashlib
+from pathlib import Path

def _write_to_file(lines, file_name):
with open(file_name, "w") as f:
@@ -340,7 +341,7 @@ def parallel_run_one_window(
indel_map,
max_ins_at_pos,
extended_window_mode,
-    exclude_non_var_pos_threshold
+    exclude_non_var_pos_threshold,
):
"""
build one window.
@@ -510,7 +511,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
exact_conformance_fix_0_1_basing_in_reads: Optional[bool] = False,
extended_window_mode: Optional[bool] = False,
exclude_non_var_pos_threshold: Optional[float] = -1,
-                  maxthreads: Optional[int] = 1) -> None:
+                  maxthreads: Optional[int] = 1,
+                  reuse_files = False) -> None:
"""Summarizes reads aligned to reference into windows.
Three products are created:
#. Multiple FASTA files (one for each window position)
@@ -580,7 +582,7 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
all_processes = []
for idx, (window_start, window_length, control_window_length) in enumerate(tiling):
counter = counter_list[idx]
-        if not os.path.isfile(f"coverage_{idx}.txt"):
+        if not (reuse_files and os.path.isfile(f"coverage_{idx}.txt")):
p = Process(
target=parallel_run_one_window,
args=(
@@ -605,6 +607,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
)
)
all_processes.append(p)
+        else:
+            logging.info(f'[file already exists] Use window files generated on {time.ctime(Path(f"coverage_{idx}.txt").stat().st_mtime)}')

for p in all_processes:
p.start()
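The skip condition above is the heart of reuse_files: a window is rebuilt unless the flag is set and the window's coverage file already exists on disk. A standalone sketch of that gate, assuming only the coverage_{idx}.txt naming visible in the diff (the function itself is illustrative):

import logging
import time
from pathlib import Path

def should_rebuild_window(idx, reuse_files):
    coverage = Path(f"coverage_{idx}.txt")
    if reuse_files and coverage.is_file():
        # st_mtime is the file's last-modification time; time.ctime
        # renders it as a human-readable timestamp for the log.
        logging.info(
            "[file already exists] Use window files generated on "
            + time.ctime(coverage.stat().st_mtime)
        )
        return False
    return True

Logging st_mtime rather than a true creation time is the portable choice on Unix filesystems, which is presumably what the earlier "fix ctime" commit was about.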
7 changes: 7 additions & 0 deletions viloca/cli.py
@@ -200,6 +200,13 @@ def main():

parser_shotgun.add_argument('--exclude_non_var_pos_threshold', metavar='FLOAT', type=float, dest="exclude_non_var_pos_threshold",
default=-1, help="Runs exclude non-variable positions mode. Set percentage threshold for exclusion.")

+    parser_shotgun.add_argument('--reuse_files', action='store_true', dest="reuse_files",
+        default=False, help="Reuse output files generated in previous runs: the tool checks for existing files and skips the corresponding regeneration step, avoiding redundant computation.")

+    parser_shotgun.add_argument('--record_history', action='store_true', dest="record_history",
+        default=False, help="When enabled, this option saves the history of the parameter values learned during the inference process.")

parser_shotgun.add_argument("--min_windows_coverage", metavar='INT', type=int,
required=False, default=2, dest="min_windows_coverage",
help="Number of windows that need to cover a mutation to have it called.")
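Both new options are plain store_true switches: the flag's presence sets the value to True and its absence leaves the default, so the default must be falsy (False): a sentinel like -1 is truthy and would make an unset flag behave as if it were enabled. A sketch of a rerun that reuses existing window files and keeps the inference history, following the invocation style of the e2e tests (the fixture paths and working directory are purely an example):

import subprocess

subprocess.run(
    "viloca run -a 0.1 -w 201 --mode shorah "
    "-f ../test_ref.fasta -b ../test_aln.cram "
    "--reuse_files --record_history",
    shell=True, check=True, cwd="./data_1/test_e2e_shorah",
)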
81 changes: 44 additions & 37 deletions viloca/local_haplotype_inference/learn_error_params/cavi.py
@@ -34,6 +34,7 @@ def multistart_cavi(
reads_weights,
n_starts,
output_dir,
+    record_history
):

pool = mp.Pool(mp.cpu_count())
@@ -51,6 +52,7 @@
reads_weights,
start,
output_dir,
+                record_history
),
callback=collect_result,
)
@@ -72,6 +74,7 @@ def run_cavi(
reads_weights,
start_id,
output_dir,
+    record_history
):

"""
@@ -85,16 +88,7 @@
"alphabet": alphabet,
} # N= #reads, K= #components

-    history_alpha = []
-    history_mean_log_pi = []
-    history_theta_c = []
-    history_theta_d = []
-    history_mean_log_theta = []
-    history_gamma_a = []
-    history_gamma_b = []
-    history_mean_log_gamma = []
-    history_mean_haplo = []
-    history_mean_cluster = []

history_elbo = []

state_init_dict = initialization.draw_init_state(
@@ -130,8 +124,8 @@
converged = False
elbo = 0
state_curr_dict = state_init_dict
-    k = 0
-    while converged is False:
+    min_number_iterations = 10
+    while (converged is False) or (iter < min_number_iterations):

if iter <= 1:
digamma_alpha_sum = digamma(state_curr_dict["alpha"].sum(axis=0))
@@ -160,7 +154,7 @@
state_init_dict,
state_curr_dict,
)

-        if k % 2 == 0:
+        if iter % 2 == 0:
history_elbo.append(elbo)
history_mean_log_pi.append(state_curr_dict["mean_log_pi"])
@@ -184,11 +178,11 @@
break
elif np.abs(elbo - history_elbo[-2]) < 1e-03:
converged = True
-            k += 1
+            iter += 1
message = "ELBO converged."
exitflag = 0
else:
-        k = 0
+        iter = 0

# if k%10==0: # every 10th parameter set is saved to history
state_curr_dict.update({"elbo": elbo})
@@ -198,28 +192,41 @@

state_curr_dict.update({"elbo": elbo})

-    dict_result.update(
-        {
-            "exit_message": message,
-            "exitflag": exitflag,
-            "n_iterations": iter,
-            "converged": converged,
-            "elbo": elbo,
-            "history_mean_log_theta": history_mean_log_theta,
-            "history_elbo": history_elbo,
-            "history_alpha": history_alpha,
-            "history_mean_log_pi": history_mean_log_pi,
-            "history_theta_c": history_theta_c,
-            "history_alpha": history_alpha,
-            "history_theta_d": history_theta_d,
-            "history_mean_log_theta": history_mean_log_theta,
-            "history_gamma_a": history_gamma_a,
-            "history_gamma_b": history_gamma_b,
-            "history_mean_log_gamma": history_mean_log_gamma,
-            "history_mean_haplo": history_mean_haplo,
-            "history_mean_cluster": history_mean_cluster,
-        }
-    )
+    if record_history:
+        dict_result.update(
+            {
+                "exit_message": message,
+                "exitflag": exitflag,
+                "n_iterations": iter,
+                "converged": converged,
+                "elbo": elbo,
+                "history_elbo": history_elbo,
+                "history_alpha": history_alpha,
+                "history_mean_log_pi": history_mean_log_pi,
+                "history_theta_c": history_theta_c,
+                "history_theta_d": history_theta_d,
+                "history_mean_log_theta": history_mean_log_theta,
+                "history_gamma_a": history_gamma_a,
+                "history_gamma_b": history_gamma_b,
+                "history_mean_log_gamma": history_mean_log_gamma,
+                "history_mean_haplo": history_mean_haplo,
+                "history_mean_cluster": history_mean_cluster,
+            }
+        )
+    else:
+        dict_result.update(
+            {
+                "exit_message": message,
+                "exitflag": exitflag,
+                "n_iterations": iter,
+                "converged": converged,
+                "elbo": elbo,
+                "history_elbo": history_elbo,
+            }
+        )


# dict_result.update(state_curr_dict)
summary = analyze_results.summarize_results(
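Boiled down, the new control flow in run_cavi does two things: the loop keeps running until the ELBO change drops below 1e-03 and at least min_number_iterations iterations have elapsed, and the bulky per-iteration history only lands in the result dict when record_history is set. A self-contained sketch of that loop shape (the ELBO update is a dummy stand-in, not the actual CAVI update equations):

import numpy as np

def toy_cavi_loop(record_history, min_number_iterations=10):
    history_elbo = []    # always kept: the convergence test needs it
    history_params = []  # only returned when record_history is set
    converged, elbo, iteration = False, 0.0, 0

    while (converged is False) or (iteration < min_number_iterations):
        elbo = -1.0 / (iteration + 1)  # dummy, monotonically increasing "ELBO"
        history_elbo.append(elbo)
        if record_history:
            history_params.append({"iteration": iteration, "elbo": elbo})
        # Converged once two successive ELBO values differ by less than 1e-03.
        if len(history_elbo) >= 2 and np.abs(elbo - history_elbo[-2]) < 1e-03:
            converged = True
        iteration += 1

    result = {"n_iterations": iteration, "converged": converged,
              "elbo": elbo, "history_elbo": history_elbo}
    if record_history:
        result["history_params"] = history_params
    return result

The sketch calls the counter iteration; the committed code reuses the name iter, which shadows Python's built-in iter().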
@@ -29,7 +29,7 @@ def gzip_file(f_name):
return f_out.name


-def main(freads_in, fref_in, output_dir, n_starts, K, alpha0, alphabet="ACGT-", unique_modus=False):
+def main(freads_in, fref_in, output_dir, n_starts, K, alpha0, alphabet="ACGT-", unique_modus=True, record_history=False):

window_id = freads_in.split("/")[-1][:-4] # freads_in is absolute path

@@ -39,7 +39,7 @@
# Read in reads
reference_seq, ref_id = preparation.load_reference_seq(fref_in)
reference_binary = preparation.reference2binary(reference_seq, alphabet)
-    reads_list = preparation.load_fasta2reads_list(freads_in, alphabet, False)
+    reads_list = preparation.load_fasta2reads_list(freads_in, alphabet, unique_modus)
reads_seq_binary, reads_weights = preparation.reads_list_to_array(reads_list)

if n_starts >1:
@@ -88,12 +88,12 @@

logging.info("CAVI termination " + str(sort_results[0][2]["exit_message"]))

-    with open(output_name + "all_results.pkl", "wb") as f2:
-        pickle.dump(sort_results, f2)
-
-    logging.info(
-        "Results dicts of all runs written to " + output_name + "all_results.pkl"
-    )
+    if record_history:
+        with open(output_name + "all_results.pkl", "wb") as f2:
+            pickle.dump(sort_results, f2)
+        logging.info(
+            "Results dicts of all runs written to " + output_name + "all_results.pkl"
+        )

state_curr_dict = result_list[max_idx][1]
logging.info("Maximal ELBO " + str(max_elbo) + "in run " + str(max_idx))
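The last two hunks thread unique_modus and record_history through to where they matter: load_fasta2reads_list now receives unique_modus instead of a hard-coded False, and the multi-start results pickle is only written when record_history is set. Conceptually, unique_modus collapses identical reads into a single entry whose weight records the multiplicity, which is what reads_list_to_array's reads_weights output then carries into the CAVI runs. A hedged sketch of that collapsing step (the function and its return shapes are assumptions for illustration, not the package's actual signatures):

from collections import Counter

def collapse_reads(seqs, unique_modus=True):
    # With unique_modus, identical read sequences merge into one entry
    # and each surviving sequence carries its multiplicity as a weight;
    # without it, every read keeps weight 1.
    if unique_modus:
        counts = Counter(seqs)  # insertion-ordered in Python 3.7+
        return list(counts.keys()), list(counts.values())
    return list(seqs), [1] * len(seqs)

reads, weights = collapse_reads(["ACGT", "ACGT", "ACG-"])
assert reads == ["ACGT", "ACG-"] and weights == [2, 1]

Gating the all_results.pkl dump behind record_history keeps routine runs from writing large multi-start pickles to disk.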