-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbam_file_processing.py
70 lines (65 loc) · 3.4 KB
/
bam_file_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""A script to process bam files generated by MosaikAligner,
removing the unmapped reads, sorting, optionally replacing read groups and merging
enriched and non-enriched samples, merging all into one, and indexing the large bam
Provide a parent folder that contains folders with mosaik alignments & no other contents,
include the trailing slash.
"""
import os
import subprocess
import time
import argparse
##time.sleep(86400)
parser = argparse.ArgumentParser()
parser.add_argument("parent_folder", help="parent folder with subfolders containing alignments")
parser.add_argument("--merge", action="store_true", help="merge files from same samples")
args = parser.parse_args()
mosdir = args.parent_folder
subfolders = [s for s in os.listdir(mosdir) if s[0] != '.']
os.mkdir(mosdir + 'proc_bams')
bamdir = mosdir + 'proc_bams/'
for folder in subfolders:
mergelist = []
files = os.listdir(mosdir + folder)
for f in files:
if f[-4:] == '.bam':
file1 = mosdir + folder + '/' + f
file2 = bamdir + f[:-4] + '.bam'
command1 = "~/samtools-0.1.16/samtools view -b -F 4 %s > %s" % (file1, file2)
subprocess.call(command1, shell=True)
prefix3 = bamdir + f[:-4] + '_sorted'
command2 = "~/samtools-0.1.16/samtools sort %s %s" % (file2, prefix3)
subprocess.call(command2, shell=True)
mergelist.append(prefix3 + '.bam')
if args.merge:
rgcommand = ['java', '-jar', '-Xmx4g', \
'/Users/cliffbeall/picard-tools-1.112/picard-tools-1.112/AddOrReplaceReadGroups.jar']
rginfo1 = ['RGID=MERGE1', 'RGLB=CB0001/21', 'RGSM=AB.BS.10', \
'RGPL=illumina', 'RGPI=170', 'RGPU=ATCACG/GTTTCG', \
'VALIDATION_STRINGENCY=SILENT']
rginfo2 = ['RGID=MERGE2', 'RGLB=CB0002/16', 'RGSM=AB.BS.3', \
'RGPL=illumina', 'RGPI=170', 'RGPU=CGATGT/CCGTCC', \
'VALIDATION_STRINGENCY=SILENT']
rginfo3 = ['RGID=MERGE3', 'RGLB=CB0003/14', 'RGSM=AA.MS.10', \
'RGPL=illumina', 'RGPI=170', 'RGPU=AGTTCCC/CCGTCC', \
'VALIDATION_STRINGENCY=SILENT']
rginfos = [rginfo1, rginfo2, rginfo3]
prefixes = ['AB_BS_10_', 'AB_BS_3_', 'AA_MS_10_']
files1 = [f for f in mergelist if ('CB0001' in f or 'CB0002' in f or 'CB0003' in f)]
files2 = [f for f in reversed(mergelist) if (('CB0014' in f or 'CB0016' in f or 'CB0021' in f))]
for (f1, f2, rginfo, prefix) in zip(files1, files2, rginfos, prefixes):
sub1 = bamdir + prefix + folder + 'part1.bam'
subprocess.call(rgcommand + ['INPUT=' + f1, 'OUTPUT=' + sub1] + rginfo)
sub2 = bamdir + prefix + folder + 'part2.bam'
subprocess.call(rgcommand + ['INPUT=' + f2, 'OUTPUT=' + sub2] + rginfo)
sub12 = bamdir + prefix + folder + '_combined.bam'
mcommand = '/Users/cliffbeall/samtools-1.0/samtools merge -c -p ' + \
' '.join([sub12, sub1, sub2])
subprocess.call(mcommand, shell=True)
mergelist.remove(f1)
mergelist.remove(f2)
mergelist.append(sub12)
mergedbam = "%sproc_bams/%s_merged.bam" % (mosdir, folder)
command3 = "~/samtools-1.0/samtools merge " + mergedbam + " " + ' '.join(mergelist)
subprocess.call(command3, shell=True)
command4 = "~/samtools-1.0/samtools index " + mergedbam
subprocess.call(command4, shell=True)