-
Notifications
You must be signed in to change notification settings - Fork 0
/
encode_context.py
116 lines (95 loc) · 3.42 KB
/
encode_context.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import getopt
import sys
from tqdm import tqdm
import os
def options():
optlist, args = getopt.getopt(sys.argv[1:],'i:o:',
['infile=','outfile='])
for o, a in optlist:
if o == '-i' or o == '--infile':
infile = a
elif o == '-o' or o == '--outfile':
outfile = a
return infile, outfile
def rc(seq):
#find the reverse complement of a sequence
rc_seq=''
seq=list(seq)
seq.reverse()
rdic={'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'}
for base in seq:
rc_seq+=rdic[base]
return rc_seq
def make_fa_dic(infile):
#import dictionary of sequences from fasta file
#sequence is a string, each entry is chromosome
#for fast lookup of sequence context
fdic={}
chrom_filter=False
with open(infile, 'r') as f:
total_lines = sum(1 for line in f)
with open(infile, 'r') as f:
for line in tqdm(f, total=total_lines, desc="Importing fasta", leave=False):
line=line.rstrip()
if '>' in line and 'sequence' not in line and 'genome' not in line:
if '>' in line:
line=line.split(' ')
chrom='chr'+line[len(line)-1]
chrom_filter=True
fdic[chrom]=[]
elif '>' in line and 'sequence' in line or 'genome' in line:
chrom_filter=False
elif chrom_filter:
fdic[chrom].append(line.upper())
for chrom in fdic:
fdic[chrom]=''.join(fdic[chrom])
return fdic
def hexamer_context():
bases=['A','C','T','G']
trimers=[]
for i in range(4):
for j in range(4):
for k in range(4):
trimers.append(bases[i]+bases[j]+bases[k])
hexamers=[]
for i in range(len(trimers)):
for j in range(len(trimers)):
hexamers.append(trimers[i]+'A'+trimers[j])
hexamers=dict(zip(hexamers,range(len(hexamers))))
hexamers_all=hexamers.copy()
for hexamer in hexamers:
hexamers_all[rc(hexamer)]=hexamers[hexamer]
return hexamers_all
def encode_context(fa_dic, hexamers_all, outfile):
context_dic={}
cd=False
with tqdm(total=len(fa_dic), desc="Encoding chromosome") as pbar:
for chrom in fa_dic:
pbar.set_description(f"Encoding {chrom}")
tmp=[4096]*3
for pos in range(3,len(fa_dic[chrom])-3):
hexamer=fa_dic[chrom][pos-3:pos+3+1]
if hexamer in hexamers_all:
tmp.append(hexamers_all[hexamer])
else:
tmp.append(4096)
for i in range(3):
tmp.append(4096)
context_dic[chrom]=dict(zip(range(1,len(tmp)+1), tmp))
context_df=pd.DataFrame(context_dic)
context_dic={}
context_df=context_df.astype(float)
context_df.to_hdf(outfile, chrom, format='table')
context_df=''
pbar.update(1)
infile, outfile = options()
#check if context already exists and if so delete it
if os.path.exists(outfile):
os.remove(outfile)
fa_dic = make_fa_dic(infile)
hexamers = hexamer_context()
encode_context(fa_dic, hexamers, outfile)