-
Notifications
You must be signed in to change notification settings - Fork 1
/
eda.smk
63 lines (47 loc) · 2.28 KB
/
eda.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
configfile: "params-grid-exp.yaml"
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from src.utils import MonitorValuesPlus
from src.msa import AnalyzerMSA
from tqdm import tqdm
# --- EDA MSA
rule eda_msa:
input:
path_msas=config["PATH_MSAS"]
output:
expand("{path_output}/analysis-msa/stats_msas.tsv", path_output=config["PATH_OUTPUT"]),
expand("{path_output}/analysis-msa/problematic_files.tsv", path_output=config["PATH_OUTPUT"])
run:
path_output=config["PATH_OUTPUT"]
mv = MonitorValuesPlus(list_vars=["path_msa","n_seqs","n_unique_seqs","n_identical_cols","n_cols", "perc_identical_cols"],
out_file=f"{path_output}/analysis-msa/stats_msas.tsv",
overwrite=True)
mv_problems = MonitorValuesPlus(list_vars=["path_msa"],
out_file=f"{path_output}/analysis-msa/problematic_files.tsv",
overwrite=True)
analyzer = AnalyzerMSA()
def run(path_msa):
"Function to run with ThreadPoolExecutor"
try:
n_cols, n_seqs, n_unique_seqs, n_identical_cols = analyzer(path_msa)
perc_identical_cols = round(100*n_identical_cols/n_cols, 2)
mv()
except:
mv_problems()
# list of (path to) msas
# list_paths = list(Path(config["PATH_MSAS"]).glob("*.[fa]*"))
# path msas
SUBSET_HLA = ["A-3105", "B-3106", "C-3107", "DQA1-3117", "DQB1-3119", "DRB1-3123"]
MSAS = list(Path(config["PATH_MSAS"]).glob("*.[fa]*"))
list_paths = [path for path in MSAS if path.stem in SUBSET_HLA]
print(MSAS)
with ThreadPoolExecutor(max_workers=16) as pool:
with tqdm(total=len(list_paths)) as progress:
progress.set_description("Analyzing MSA")
futures=[]
for path_msa in list_paths:
future = pool.submit(run, path_msa)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
for future in futures:
future.result()