-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnovpergene.py
46 lines (42 loc) · 2.08 KB
/
novpergene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# 首先加载gene_trans_map.txt中每个基因的所有isoform
gene_to_all_isoforms = {}
with open("gene_trans_map.txt", "r") as f:
for line in f:
parts = line.strip().split()
gene = parts[0]
isoform = parts[1]
if gene not in gene_to_all_isoforms:
gene_to_all_isoforms[gene] = set()
gene_to_all_isoforms[gene].add(isoform)
# 接下来,统计ovule_3dpa_filter.txt中每个基因实际表达的isoform数量
gene_to_expressed_isoforms = {}
with open("ovule_3dpa_filter.txt", "r") as f:
for line in f:
parts = line.strip().split()
if len(parts) < 4 or not parts[0].startswith("MSTRG"):
continue
isoform_id = parts[0]
gene_id = isoform_id.rsplit('.', 1)[0]
if gene_id not in gene_to_expressed_isoforms:
gene_to_expressed_isoforms[gene_id] = set()
gene_to_expressed_isoforms[gene_id].add(isoform_id)
# 现在计算每个isoform在其基因中的占比,并输出到新文件中
with open("ovule_3dpa_filter.txt", "r") as infile, open("output_with_expressed_percentage.txt", "w") as outfile:
for line in infile:
parts = line.strip().split()
if len(parts) < 4:
# 可以选择输出标题行,如果它存在
outfile.write(line.strip() + "\n")
continue
isoform_id = parts[0]
gene_id = isoform_id.rsplit('.', 1)[0]
# 确认isoform_id是从gene_trans_map.txt中统计的MSTRG开始的isoform
if gene_id in gene_to_all_isoforms and isoform_id in gene_to_all_isoforms[gene_id]:
# 使用gene_id获取实际表达的isoform的数量
expressed_count = len(gene_to_expressed_isoforms.get(gene_id, []))
total_count = len(gene_to_all_isoforms[gene_id])
percentage = 100.0 / total_count if expressed_count > 0 else 0
outfile.write(line.strip() + f"\t{percentage:.2f}%\n")
else:
# 如果isoform没有在ovule_3dpa_filter.txt中表达或者不是MSTRG开始的,占比为0%
outfile.write(line.strip() + "\t0.00%\n")