-
Notifications
You must be signed in to change notification settings - Fork 2
/
construct_vcf_from_multifasta.py
executable file
·96 lines (83 loc) · 3.98 KB
/
construct_vcf_from_multifasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/python
import sys
import os
import random
import argparse
from os import listdir
from os.path import isfile, join
#also cleanups header from spaces and some other bad chars
def split_fasta(filename, outputdir):
if not os.path.isdir(outputdir):
os.mkdir(outputdir)
outFile = None
for line in open(filename):
if line[0] == '>':
if outFile:
outFile.close()
line = line.replace(" ","_").replace("|","_").replace("/","_")
outFile = open(os.path.join(outputdir, line[1:].strip() + '.fa'), 'w')
if outFile:
outFile.write(line)
if outFile: # if filename is empty
outFile.close()
def parse_args(args):
###### Command Line Argument Parser
parser = argparse.ArgumentParser(description="Script for construction of large vcf file from GISAID SARS-CoV2 DB.")
parser._action_groups.pop()
required_args = parser.add_argument_group('required arguments')
required_args.add_argument('--ref', required = True, help='Path to reference (NC_045512.2 aka MN908947) file')
required_args.add_argument('--multifasta', required = True, help='Path to multifasta with SARS-CoV2 strains')
required_args.add_argument('--out', required = True, help='Path to output file')
optional_args = parser.add_argument_group('optional arguments')
optional_args.add_argument('--minimap2', help='Path to minimap2 distribution')
optional_args.add_argument('--picard', help='Path to a folder with picard https://broadinstitute.github.io/picard/ script')
optional_args.add_argument('--k8', help='Path to k8 binary, required to run paftools.js from minimap2 distribution')
if len(sys.argv)==1:
parser.print_help(sys.stderr)
sys.exit(1)
return parser.parse_args()
def construct_vcf(args):
minimap2 = ""
picard = ""
k8 = ""
ref = args.ref
merged = args.multifasta
outfile = args.out
if args.minimap2:
minimap2= args.minimap2
if args.k8:
k8=args.k8
if args.picard:
picard = args.picard
splitted = "splitted"
vcfs = "vcfs"
if not os.path.isdir(splitted):
os.mkdir(splitted)
for f in listdir(splitted):
os.remove(os.path.join(splitted, f))
if not os.path.isdir(vcfs):
os.mkdir(vcfs)
for f in listdir(vcfs):
os.remove(os.path.join(vcfs, f))
split_fasta (merged, splitted)
for f in listdir(splitted):
l = (os.path.join(minimap2, "minimap2") + " -c --cs " + ref + " " + os.path.join(splitted, f) + " 2>/dev/null | sort -k6,6 -k8,8n | " + k8 + " " + os.path.join(minimap2, "misc","paftools.js") + " call -L20000 -f " + ref + " - > " + os.path.join(vcfs, os.path.basename(f) + ".vcf"))
paf_file = os.path.join(splitted, os.path.basename(f) +".paf")
minimap2_str = os.path.join(minimap2, "minimap2") + " -c --cs " + ref + " " + os.path.join(splitted, f) +" > " +paf_file + " 2>/dev/null"
# print minimap2_str
paftools_str = "sort -k6,6 -k8,8n " + paf_file +" | " + k8 + " " + os.path.join(minimap2, "misc","paftools.js") + " call -L20000 -f " + ref + " - > " + os.path.join(vcfs, os.path.basename(f) + ".vcf")
os.system(minimap2_str)
os.system(paftools_str)
# os.system (os.path.join(minimap2, "minimap2") + " -c --cs " + ref + " " + os.path.join(splitted, f) + " 2>/dev/null | sort -k6,6 -k8,8n | " + k8 + " " + os.path.join(minimap2, "misc","paftools.js") + " call -L20000 -f " + ref + " - > " + os.path.join(vcfs, os.path.basename(f) + ".vcf"))
vcflist = open("vcfs.list", "w")
for f in listdir(vcfs):
if f.find("_bat_")== -1 and f.find("pangolin") == -1:
vcflist.write(os.path.join(vcfs, f) + "\n")
vcflist.close()
os.system("java -jar " + os.path.join(picard, "picard.jar") + " MergeVcfs I=vcfs.list O=" + outfile)
#if len(sys.argv) != 3:
# print "Usage: " + sys.argv[0] + " ref multifasta "
# exit()
if __name__ == "__main__":
args = parse_args(sys.argv[1:])
construct_vcf(args)