Update command line parameters (#32)

* working multiprocessing * counter and control_window_length are now computed in advance of _run_one_window() * [test] test_run_one_window had to be adapted as it has now also the control_window_length as input * [test] test_run_one_window had to be adapted as it has now also the control_window_length as input * [test] correct expected output * [test] computation control_window_length * fixed: problem with hash not being deterministic which caused problems in the multi-threading, now using a deterministic version * clean up * correct indent * random hash to deterministic hash * update multiprocessing modules that are needed * fix: computation control window length * delete empty log file * delete empty log file * shotgun command to run command * update log file naming * [README] add channels to conda environment installation * updated readme with new commands * update imported packages to viloca, and other naming * update viloca naming here * update viloca naming here * [unit tests] update commands * update author * build is needed for libshorah * updated readme
cbg-ethz · Nov 27, 2023 · 39fc4c9 · 39fc4c9
1 parent 6835ab1
commit 39fc4c9
Show file tree

Hide file tree

Showing 38 changed files with 830 additions and 355 deletions.
diff --git a/README.md b/README.md
@@ -1,9 +1,5 @@
 VILOCA: VIral LOcal haplotype reconstruction and mutation CAlling for short and long read data
 ===============
-[![Build Status](https://travis-ci.org/cbg-ethz/shorah.svg?branch=master)](https://travis-ci.org/cbg-ethz/shorah)
-[![Bioconda package](https://img.shields.io/conda/dn/bioconda/shorah.svg?label=Bioconda)](https://bioconda.github.io/recipes/shorah/README.html)
-[![Docker container](https://quay.io/repository/biocontainers/shorah/status)](https://quay.io/repository/biocontainers/shorah)
-
 
 VILOCA is an open source project for the analysis of next generation sequencing
 data. It is designed to analyse genetically heterogeneous samples. Its tools
@@ -17,22 +13,26 @@ genetic variants present in a mixed sample.
 For installation miniconda is recommended: https://docs.conda.io/en/latest/miniconda.html.
 We recommend to install VILOCA in a clean conda environment:
 ```
-conda create --name env_viloca libshorah
+conda create --name env_viloca --channel conda-forge --channel bioconda libshorah
 conda activate env_viloca
-pip install git+https://github.com/LaraFuhrmann/VILOCA@master
+pip install git+https://github.com/cbg-ethz/VILOCA@master
 ```
 
 ### Example
-To test your installation, we recommend running the program on `tests/data_1`.
+To test your installation, run VILOCA on `tests/data_1`:
+```
+viloca run -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode use_quality_scores
+```
+
 
 If the sequencing amplicon strategy is known, we recommend using the amplicon mode of the program, which takes the `<smth>.insert.bed` file as input:
-`shorah shotgun -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode use_quality_scores`
+`viloca run -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode use_quality_scores`
 
 If the sequencing quality scores are not trustworthy, the sequencing error parameters can also be learned:
-`shorah shotgun -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode learn_error_params`.
+`viloca run -b test_aln.cram -f test_ref.fasta -z scheme.insert.bed --mode learn_error_params`.
 
 If there is no information on the sequencing amplicon strategy available, run:
-`shorah shotgun -b test_aln.cram -f test_ref.fasta --mode use_quality_scores`
+`viloca run -b test_aln.cram -f test_ref.fasta --mode use_quality_scores`
 
 ### Parameters
 There are several parameters available:  

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,16 +1,16 @@
 [tool.poetry]
-name = "ShoRAH"
+name = "VILOCA"
 version = "0.1.0"
 description = "SHOrt Reads Assembly into Haplotypes"
 license = "GPL-3.0-only"
-authors = ["Benjamin Langer <[email protected]>"]
+authors = ["Benjamin Langer <[email protected]>, Lara Fuhrmann <[email protected]>"]
 build = "build.py"
 packages = [
-    { include = "shorah" }
+    { include = "viloca" }
 ]
 
 [tool.poetry.scripts]
-shorah = 'shorah.cli:main'
+viloca = 'viloca.cli:main'
 
 [tool.poetry.dependencies]
 python = ">=3.9.9,<3.11"

diff --git a/tests/data_1/shotgun_test.sh b/tests/data_1/shotgun_test.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 
-shorah shotgun -a 0.1 -w 201 -x 100000 -p 0.9 -c 0 \
+viloca run -a 0.1 -w 201 -x 100000 -p 0.9 -c 0 \
 -r HXB2:2469-3713 -R 42 -f test_ref.fasta -b test_aln.cram --out_format csv "$@"
diff --git a/tests/data_5/shotgun_prepare.sh b/tests/data_5/shotgun_prepare.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-shorah shotgun -a 0.1 -w 42 -x 100000 -p 0.9 -c 0 -r REF:43-273 -R 42 -b test_aln.cram -f ref.fasta
+viloca run -a 0.1 -w 42 -x 100000 -p 0.9 -c 0 -r REF:43-273 -R 42 -b test_aln.cram -f ref.fasta
diff --git a/tests/test_b2w.py b/tests/test_b2w.py
@@ -2,7 +2,7 @@
 import filecmp
 import os
 import glob
-from shorah import b2w, tiling
+from viloca import b2w, tiling
 import math
 import libshorah
 

diff --git a/tests/test_b2w_mapping.py b/tests/test_b2w_mapping.py
@@ -1,7 +1,8 @@
 from array import array
 import pytest
 from cigar import Cigar
-from shorah import b2w
+from viloca import b2w
+import hashlib
 
 class MockAlignedSegment:
     def __init__(self, query_name: str, reference_start: int, query_sequence: str, cigarstring: str):
@@ -35,10 +36,10 @@ def add_indels(self, indels_map):
         cnt = self.reference_start
         for i in self.cigartuples:
             if i[0] == 1: # insert TODO Justify -1
-                indels_map.append((self.query_name, self.reference_start, hash(self.cigarstring), cnt-1, i[1], 0)) # cnt-1
+                indels_map.append((self.query_name, self.reference_start, hashlib.sha1(self.cigarstring.encode()).hexdigest(), cnt-1, i[1], 0)) # cnt-1
             elif i[0] == 2: # del
                 for k in range(i[1]):
-                    indels_map.append((self.query_name, self.reference_start, hash(self.cigarstring), cnt+k, 0, 1))
+                    indels_map.append((self.query_name, self.reference_start, hashlib.sha1(self.cigarstring.encode()).hexdigest(), cnt+k, 0, 1))
                 cnt += i[1]
             else:
                 cnt += i[1]
@@ -326,11 +327,22 @@ def test_run_one_window(mArr, spec, window_length, window_start, extended_window
     mock_dict = mocker.MagicMock()
     mock_dict.__getitem__.return_value = 42
 
-    arr, _, _, _, _, _ = b2w._run_one_window(
+    # added by Lara
+    original_window_length = window_length
+    control_window_length = window_length
+
+    if extended_window_mode:
+        for pos, val in max_indel_at_pos.items():
+            if window_start <= pos < window_start + original_window_length:
+                control_window_length += val
+
+
+    arr, _, _, _, = b2w._run_one_window(
         mock_samfile,
-        window_start,
+        window_start, # 0 based
         "HXB2-does-not-matter",
         window_length,
+        control_window_length,
         0,
         mock_dict,
         0,
@@ -343,4 +355,4 @@ def test_run_one_window(mArr, spec, window_length, window_start, extended_window
     print(arr)
 
     for idx, el in enumerate(arr):
-        assert el.split("\n")[1] == spec[idx]
+        assert el.split("\n")[1] == spec[idx]
diff --git a/tests/test_envp_post.py b/tests/test_envp_post.py
@@ -1,5 +1,5 @@
 from unittest.mock import patch, mock_open
-from shorah import envp_post
+from viloca import envp_post
 
 DEFAULT_MOCK_DATA = "default mock data"
 

diff --git a/tests/test_pooled_post.py b/tests/test_pooled_post.py
@@ -1,5 +1,5 @@
 from unittest.mock import patch, mock_open
-from shorah import pooled_post
+from viloca import pooled_post
 import numpy as np
 
 DEFAULT_MOCK_DATA = "default mock data"
@@ -48,4 +48,4 @@ def open_side_effect(name):
 #                                      open("debug/w-HXB2-2938-3138.dbg"),
 #                                      open("support/w-HXB2-2938-3138.reads-support.fas"),
 #                                      open("corrected/w-HXB2-2938-3138.reads-cor.fas"),
-#                                      "shorah") # TODO
+#                                      "shorah") # TODO
diff --git a/tests/test_pooled_pre.py b/tests/test_pooled_pre.py
@@ -1,6 +1,6 @@
 import pysam
 import os
-from shorah import pooled_pre
+from viloca import pooled_pre
 
 def test__annotate_alignment_file():
     out = "out.bam"
@@ -48,4 +48,4 @@ def test_pre_process_pooled():
     os.remove(out + ".bai")
 
     assert a[0] == a[1] != 0
-    assert a[2] == 0
+    assert a[2] == 0
diff --git a/tests/test_shorah_snv.py b/tests/test_shorah_snv.py
@@ -1,5 +1,5 @@
 import pytest
-from shorah.shorah_snv import _compare_ref_to_read, SNP_id, SNV
+from viloca.shorah_snv import _compare_ref_to_read, SNP_id, SNV
 
 
 @pytest.mark.parametrize("ref, seq, spec", [
@@ -39,4 +39,4 @@ def test_compare_ref_to_read(ref, seq, spec):
 
     assert snp == spec
 
-    assert tot_snv == len(snp)
+    assert tot_snv == len(snp)
diff --git a/tests/test_tiling.py b/tests/test_tiling.py
@@ -1,4 +1,4 @@
-from shorah import tiling
+from viloca import tiling
 import pytest
 
 def test_equispaced():

diff --git a/shorah/__init__.py → viloca/__init__.py b/shorah/__init__.py → viloca/__init__.py
diff --git a/shorah/__main__.py → viloca/__main__.py b/shorah/__main__.py → viloca/__main__.py
@@ -8,7 +8,7 @@
 - https://docs.python.org/2/using/cmdline.html#cmdoption-m
 - https://docs.python.org/3/using/cmdline.html#cmdoption-m
 """
-from shorah.cli import main
+from viloca.cli import main
 
 if __name__ == "__main__":
     main()