-
Notifications
You must be signed in to change notification settings - Fork 0
/
chromosome_names_in_files.py
149 lines (103 loc) · 6.23 KB
/
chromosome_names_in_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding: utf-8 -*-
"""
"""
#%%
def chromosome_name_bedfile(bed_file=None):
'''This function returns some properties of the chromosomes in a bed file.
Input can be either of two options:
The full path to a bed file after which the program opens the bed file, or
a list of the lines in the bed file. The latter requires to read the bed file before calling this function and input all lines in the bed file as a list. The function than does not open the bed file again.
Returns three dictionaries (in this order):
The first indicates the names of the chromosomes as used in the bed file (keys are roman numerals 1 to 16 and the values are the names used in the bed file).
The second is the start line in the bed file of each chromosome (keys are the roman numerals of the chromosome names and the values are the start lines in the bed file of the chromosome).
The third is the end line in the bed file of each chromosome (keys are the roman numerals of the chromosome names and the values are the start lines in the bed file of the chromosome)
CHANGE LINE 60 AND 71 TO AUTOMATICALLY RECOGNIZE THE MITOCHONDRIAL DNA NAME
'''
if bed_file == None:
import os
file_dirname = os.path.dirname(os.path.abspath('__file__'))
bed_file = os.path.join(file_dirname,'..','..','satay_analysis_testdata','Output_Processing','Cerevisiae_WT2_Michel2017_trimmed1.bam.bed')
if type(bed_file) is str:
with open(bed_file) as f:
lines = f.readlines()
elif type(bed_file) is list:
lines = bed_file
num_arabic = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]
num_roman = ['I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI']
roman_to_arabic_dict = {}
index_counter = 0
for rom in num_roman:
roman_to_arabic_dict[rom] = num_arabic[index_counter]
index_counter += 1
chromosome_romannames_list = []
for roman in roman_to_arabic_dict:
chromosome_romannames_list.append(roman)
chrom_names_dict = {}
chrom_start_line_dict = {}
chrom_end_line_dict = {}
chrom_name_in_bed = ''
chr_counter = 0
line_counter = 0
stop_loop = False
while stop_loop is False:
line = lines[line_counter]
chrom_name_current = line.split(' ')[0].replace('chr','')
if not chrom_name_current.startswith('track') and not chrom_name_current.startswith('M') and not chrom_name_current.startswith('ref|NC_001224|') and chr_counter < 16: #SKIP HEADER AND MITOCHRONDRIAL CHROMOSOMES #and not chrom_name_current.startswith('M')
if chrom_name_current != chrom_name_in_bed:
chrom_names_dict[chromosome_romannames_list[chr_counter]] = chrom_name_current
chrom_name_in_bed = chrom_name_current
# print('Chromosome ',chromosome_romannames_list[chr_counter], 'is ',chrom_name_current)
chrom_start_line_dict[chromosome_romannames_list[chr_counter]] = line_counter #GET START INDEX IN THE BED FILE OF THE CURENT CHROMOSOME
if chr_counter != 0:
chrom_end_line_dict[chromosome_romannames_list[chr_counter-1]] = line_counter-1 #GET THE END INDEX IN THE BED OF THE PREVIOUS CHROMOSOME (SKIP FOR THE FIRST CHROMOSOME)
chr_counter += 1
elif chrom_name_current.startswith('M') or chrom_name_current.startswith('ref|NC_001224|'):
chrom_end_line_dict[chromosome_romannames_list[-1]] = line_counter-1 #GET THE END INDEX IN THE BED FILE FOR THE FINAL CHROMOSOME
stop_loop = True
line_counter += 1
return(chrom_names_dict, chrom_start_line_dict, chrom_end_line_dict)
#%%
def chromosome_name_wigfile(wig_file=None):
if wig_file == None:
import os
file_dirname = os.path.dirname(os.path.abspath('__file__'))
wig_file = os.path.join(file_dirname,'..','..','satay_analysis_testdata','Output_Processing','Cerevisiae_WT2_Michel2017_trimmed1.bam.wig')
if type(wig_file) is str:
with open(wig_file) as f:
lines = f.readlines()
elif type(wig_file) is list:
lines = wig_file
num_arabic = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]
num_roman = ['I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI','M']
roman_to_arabic_dict = {}
index_counter = 0
for rom in num_roman:
roman_to_arabic_dict[rom] = num_arabic[index_counter]
index_counter += 1
chromosome_romannames_list = []
for roman in roman_to_arabic_dict:
chromosome_romannames_list.append(roman)
chrom_names_dict = {}
chrom_start_line_dict = {}
chrom_end_line_dict = {}
chr_counter = 0
line_counter = 0
chrom_line = 'variablestep'
for line in lines:
line.strip('\n')
line_split = line.split(' ')
if line_split[0].lower() == chrom_line and chr_counter < 17:
chromosome_name = line_split[1].replace('chrom=chr','').strip('\n')
chrom_names_dict[chromosome_romannames_list[chr_counter]] = chromosome_name
# print('Chromosome ',chromosome_romannames_list[chr_counter], 'is ',chromosome_name)
chrom_start_line_dict[chromosome_romannames_list[chr_counter]] = line_counter+1 #GET START INDEX IN THE BED FILE OF THE CURENT CHROMOSOME
if chr_counter != 0:
chrom_end_line_dict[chromosome_romannames_list[chr_counter-1]] = line_counter #GET THE END INDEX IN THE BED OF THE PREVIOUS CHROMOSOME (SKIP FOR THE FIRST CHROMOSOME)
chr_counter += 1
line_counter += 1
chrom_end_line_dict[chromosome_romannames_list[chr_counter-1]] = len(lines)
return(chrom_names_dict, chrom_start_line_dict, chrom_end_line_dict)
#%%
if __name__ == '__main__':
chromosome_name_bedfile(bed_file=r"C:\Users\gregoryvanbeek\Documents\testing_site\wt1_testfolder_w303_LYZE01_1\align_out\ERR1533147_trimmed.sorted.bam.bed")
# chromosome_name_wigfile(wig_file=r"C:\Users\gregoryvanbeek\Desktop\Python_matlab_differences\E-MTAB-4885_WT2\E-MTAB-4885.WT2.bam_python.wig")