megahit_graph.snake

#from Bio.SeqIO.FastaIO import SimpleFastaParser as SFP


rule create_fastg:
	input : "{group}/contigs/contigs.fa"
	output: temp("{group}/graph/contigs.fastg")
	log : "{group}/graph/fastg.log"
	conda : CONDA_ENV + "/megahit.yaml"
	singularity : "docker://quay.io/annacprice/megahit:1.2.9"
	# I need the kmer length, if I put this in params, snakemake will ask for the value to be clear before the start of the pipeline, which is not possible, since we need at least the assembly.
	# also, if you try to store the value inside kmer_len, it will fail due to set -o. 
	shell : """
	# kmer_len=$(cut -d "_" -f -1 {input} |grep -m1 k | sed -E 's/(>k)+//') 
	# echo $kmer_len
	megahit_toolkit contig2fastg $(cut -d "_" -f -1 {input} |grep -m1 k | sed -E 's/(>k)+//') {input} > {output} 2>{log}
	"""
		
rule create_gfa:
	input: "{file}.fastg"
	output: temp("{file}_bad_name.gfa")
	log : "{file}.log"
	conda : CONDA_ENV + "/bandage.yaml"
	singularity : "docker://quay.io/annacprice/bandage:0.8.1"
	shell : "Bandage reduce {input} {output} 2>{log}"

rule rename_gfa:
	input: gfa="{group}/graph/contigs_bad_name.gfa",
		fa="{group}/contigs/contigs.fa"
	output: "{group}/graph/contigs.gfa"
	conda : CONDA_ENV + "/pythonenv.yaml"
	singularity : "docker://quay.io/annacprice/pythonenv:3.9"
	shell : "{SCRIPTS}/Rename_gfa.py {input.gfa} {input.fa} {output}"

rule subgraph:
	input: gfa="{group}/graph/contigs.gfa",
		   annotation="{group}/annotation/contigs_{DB}_best_hits.tsv"
	output:"{group}/graph/contigs.{DB}.gfa"
	conda : CONDA_ENV + "/pythonenv.yaml"
	singularity : "docker://quay.io/annacprice/pythonenv:3.9"
	shell:"{SCRIPTS}/Graph_selection.py {input.annotation} {input.gfa} component -o {output}"

rule subgraph_contiguous:
	input: gfa="{group}/graph/{name}.gfa",
		   annotation="{group}/annotation/contigs_{DB}_best_hits.tsv"
	output:"{group}/graph/{name}.{DB}_contiguous.gfa"
	conda : CONDA_ENV + "/pythonenv.yaml"
	singularity : "docker://quay.io/annacprice/pythonenv:3.9"
	shell:"{SCRIPTS}/Graph_selection.py {input.annotation} {input.gfa} contiguous -o {output}"

rule orf_graph:
	input: gfa="{group}/graph/{name}.gfa",
		bed="{group}/annotation/orf.bed"
	output:"{group}/graph/{name}_ORF.gfa"
	conda : CONDA_ENV + "/pythonenv.yaml"
	singularity : "docker://quay.io/annacprice/pythonenv:3.9"
	shell:"{SCRIPTS}/Build_ORF_graph.py {input.bed} {input.gfa} {output}"

rule graph_annotation:
	input: gfa="{group}/graph/{name}.gfa",
		List_annotation=expand("{group}/annotation/contigs_{annotation}_best_hits.tsv", annotation=DIAMOND,group=GROUPS)
	output:
		colors="{group}/graph/{name}.colors",
		graph_annotation=temp("{group}/graph/{name}_annotation.tmp")
	run:
		Handle=open(output["colors"],"w")
		Handle.write('\n'.join(["\t".join([wildcards.group+"/annotation/contigs_"+db+".tsv",color]) for db,color in DB_COLORS.items()]))
		Handle.close()
		shell("{SCRIPTS}/Generate_Graph_metadata.py {input.gfa} {output.colors} {output.graph_annotation}")

# rule gfa_to_fasta:
# 	input: gfa="{name}_contiguous.gfa"
# 	output: contigs="{name}_contigs.fa"
# 	run :
# 		Handle=open(output["contigs"],"w")
# 		for line in open(input["gfa"]) :
# 			if line[0]=="S" :
# 				Splitline=line.split("\t")
# 				Handle.write(">"+Splitline[1]+"\n"+Splitline[2]+"\n")
# 		Handle.close()

# rule CAT_graph_annotation:
# 	input: contigs="{group}/graph/{name}_contigs.fa"
# 	output: ORF2LCA="{group}/graph/CAT/{name}.ORF2LCA.txt"
# 	params : Dir="{group}/graph/CAT/{name}"
# 	threads: 1000
# 	shell : """
# 	CAT contigs -c {input.contigs}  -d {CAT_DB} -t {CAT_DB} -n {threads} --out_prefix {params.Dir} --top 11 --I_know_what_Im_doing --force
# 	"""

# rule CAT_ORF_graph_annotation:
# 	input: ORF2LCA="{group}/graph/CAT/{name}.ORF2LCA.txt"
# 	output: annotation="{group}/graph/{name}.cat",
# 			done=touch("{group}/graph/{name}.taxadone")
# 	params : Dir="{group}/graph/CAT/{name}"
# 	threads: 1000
# 	shell : """
# 	CAT add_names -i {input.ORF2LCA} -o {output.annotation} -t {CAT_DB} --only_official
# 	"""

rule concatenate_annotation:
	input: graph_annotation="{group}/graph/{name}_contiguous_ORF_annotation.tmp"
	output: annotation="{group}/graph/{name}_contiguous_ORF_annotation.csv"
	params : Dir="{group}/graph/CAT/{name}"
	run :
		Handle=open(output["annotation"],"w")
		Cat_annotation={line.split('\t')[0]:line.rstrip().split('\t')[3:] for line in open(input["cat"])}
		Handle_graph=open(input["graph_annotation"])
		Handle.write(next(Handle_graph).rstrip()+","+",".join(Cat_annotation["# ORF"])+"\n")
		for line in Handle_graph :
			orf=line.split(",")[0]
			Annotation=["","","","","","",""]
			if orf in Cat_annotation :
				Annotation=map(lambda x:x*(x!='not classified'),Cat_annotation[orf])
			Handle.write(line.rstrip()+","+",".join(Annotation)+"\n")
		Handle.close()