-
Notifications
You must be signed in to change notification settings - Fork 3
/
megahit_graph.snake
117 lines (103 loc) · 4.65 KB
/
megahit_graph.snake
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#from Bio.SeqIO.FastaIO import SimpleFastaParser as SFP
rule create_fastg:
input : "{group}/contigs/contigs.fa"
output: temp("{group}/graph/contigs.fastg")
log : "{group}/graph/fastg.log"
conda : CONDA_ENV + "/megahit.yaml"
singularity : "docker://quay.io/annacprice/megahit:1.2.9"
# I need the kmer length, if I put this in params, snakemake will ask for the value to be clear before the start of the pipeline, which is not possible, since we need at least the assembly.
# also, if you try to store the value inside kmer_len, it will fail due to set -o.
shell : """
# kmer_len=$(cut -d "_" -f -1 {input} |grep -m1 k | sed -E 's/(>k)+//')
# echo $kmer_len
megahit_toolkit contig2fastg $(cut -d "_" -f -1 {input} |grep -m1 k | sed -E 's/(>k)+//') {input} > {output} 2>{log}
"""
rule create_gfa:
input: "{file}.fastg"
output: temp("{file}_bad_name.gfa")
log : "{file}.log"
conda : CONDA_ENV + "/bandage.yaml"
singularity : "docker://quay.io/annacprice/bandage:0.8.1"
shell : "Bandage reduce {input} {output} 2>{log}"
rule rename_gfa:
input: gfa="{group}/graph/contigs_bad_name.gfa",
fa="{group}/contigs/contigs.fa"
output: "{group}/graph/contigs.gfa"
conda : CONDA_ENV + "/pythonenv.yaml"
singularity : "docker://quay.io/annacprice/pythonenv:3.9"
shell : "{SCRIPTS}/Rename_gfa.py {input.gfa} {input.fa} {output}"
rule subgraph:
input: gfa="{group}/graph/contigs.gfa",
annotation="{group}/annotation/contigs_{DB}_best_hits.tsv"
output:"{group}/graph/contigs.{DB}.gfa"
conda : CONDA_ENV + "/pythonenv.yaml"
singularity : "docker://quay.io/annacprice/pythonenv:3.9"
shell:"{SCRIPTS}/Graph_selection.py {input.annotation} {input.gfa} component -o {output}"
rule subgraph_contiguous:
input: gfa="{group}/graph/{name}.gfa",
annotation="{group}/annotation/contigs_{DB}_best_hits.tsv"
output:"{group}/graph/{name}.{DB}_contiguous.gfa"
conda : CONDA_ENV + "/pythonenv.yaml"
singularity : "docker://quay.io/annacprice/pythonenv:3.9"
shell:"{SCRIPTS}/Graph_selection.py {input.annotation} {input.gfa} contiguous -o {output}"
rule orf_graph:
input: gfa="{group}/graph/{name}.gfa",
bed="{group}/annotation/orf.bed"
output:"{group}/graph/{name}_ORF.gfa"
conda : CONDA_ENV + "/pythonenv.yaml"
singularity : "docker://quay.io/annacprice/pythonenv:3.9"
shell:"{SCRIPTS}/Build_ORF_graph.py {input.bed} {input.gfa} {output}"
rule graph_annotation:
input: gfa="{group}/graph/{name}.gfa",
List_annotation=expand("{group}/annotation/contigs_{annotation}_best_hits.tsv", annotation=DIAMOND,group=GROUPS)
output:
colors="{group}/graph/{name}.colors",
graph_annotation=temp("{group}/graph/{name}_annotation.tmp")
run:
Handle=open(output["colors"],"w")
Handle.write('\n'.join(["\t".join([wildcards.group+"/annotation/contigs_"+db+".tsv",color]) for db,color in DB_COLORS.items()]))
Handle.close()
shell("{SCRIPTS}/Generate_Graph_metadata.py {input.gfa} {output.colors} {output.graph_annotation}")
# rule gfa_to_fasta:
# input: gfa="{name}_contiguous.gfa"
# output: contigs="{name}_contigs.fa"
# run :
# Handle=open(output["contigs"],"w")
# for line in open(input["gfa"]) :
# if line[0]=="S" :
# Splitline=line.split("\t")
# Handle.write(">"+Splitline[1]+"\n"+Splitline[2]+"\n")
# Handle.close()
# rule CAT_graph_annotation:
# input: contigs="{group}/graph/{name}_contigs.fa"
# output: ORF2LCA="{group}/graph/CAT/{name}.ORF2LCA.txt"
# params : Dir="{group}/graph/CAT/{name}"
# threads: 1000
# shell : """
# CAT contigs -c {input.contigs} -d {CAT_DB} -t {CAT_DB} -n {threads} --out_prefix {params.Dir} --top 11 --I_know_what_Im_doing --force
# """
# rule CAT_ORF_graph_annotation:
# input: ORF2LCA="{group}/graph/CAT/{name}.ORF2LCA.txt"
# output: annotation="{group}/graph/{name}.cat",
# done=touch("{group}/graph/{name}.taxadone")
# params : Dir="{group}/graph/CAT/{name}"
# threads: 1000
# shell : """
# CAT add_names -i {input.ORF2LCA} -o {output.annotation} -t {CAT_DB} --only_official
# """
rule concatenate_annotation:
input: graph_annotation="{group}/graph/{name}_contiguous_ORF_annotation.tmp"
output: annotation="{group}/graph/{name}_contiguous_ORF_annotation.csv"
params : Dir="{group}/graph/CAT/{name}"
run :
Handle=open(output["annotation"],"w")
Cat_annotation={line.split('\t')[0]:line.rstrip().split('\t')[3:] for line in open(input["cat"])}
Handle_graph=open(input["graph_annotation"])
Handle.write(next(Handle_graph).rstrip()+","+",".join(Cat_annotation["# ORF"])+"\n")
for line in Handle_graph :
orf=line.split(",")[0]
Annotation=["","","","","","",""]
if orf in Cat_annotation :
Annotation=map(lambda x:x*(x!='not classified'),Cat_annotation[orf])
Handle.write(line.rstrip()+","+",".join(Annotation)+"\n")
Handle.close()