forked from kausmees/prophaser
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathinterpolate_maps.py
150 lines (112 loc) · 3.51 KB
/
interpolate_maps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/python
# this script is based on the code in interpolate_maps.py this repository: https://github.com/joepickrell/1000-genomes-genetic-maps
import os
chr = 20
infilename = "/media/kristiina/My Passport/Data/1KGdata/source/markers.txt"
mapfilename = "/path_to_HapMap2_data/HapmapII_GRCh37_RecombinationHotspots/genetic_map_GRCh37_chr"+ str(chr)+".txt"
outfilename = "5_snps_interpolated_HapMap2_map_20_tmp"
infile = open(infilename)
mapfile = open(mapfilename)
outfile = open(outfilename,"w")
posin = list()
rsin = list()
mappos = list()
mapgpos = list()
# for the given list of marker positions
line = infile.readline()
while line:
line = line.strip().split()
pos = int(line[0])
rs = line[1]
posin.append(pos)
rsin.append(rs)
line = infile.readline()
line = mapfile.readline()
line = mapfile.readline()
while line:
line = line.strip().split()
if "OMNI" in mapfilename:
pos = int(line[0]) # 1000 Genomes OMNI map
elif "HapmapII" in mapfilename:
pos = int(line[1]) # HapMap2 map
elif "plink" in mapfilename:
pos = int(line[3]) # Beagle-supplied PLINK map
if "OMNI" in mapfilename:
gpos = float(line[2]) # 1000 Genomes OMNI map
elif "HapmapII" in mapfilename:
gpos = float(line[3]) # HapMap2 map
elif "plink" in mapfilename:
gpos = float(line[2]) # Beagle-supplied PLINK map
mappos.append(pos)
mapgpos.append(gpos)
line = mapfile.readline()
print("Length posin : " + str(len(posin)))
index1 = 0
index2 = 0
while index1 < len(posin):
pos = posin[index1]
rs = rsin[index1]
if pos == mappos[index2]:
#the 1000 Genomes site was genotyped as part of the map
print >> outfile, rs, pos, mapgpos[index2]
# print rs, pos, mapgpos[index2]
index1 = index1+1
elif pos < mappos[index2]:
#current position in interpolation before marker
if index2 ==0:
#before the first site in the map (genetic position = 0)
print >> outfile, rs, pos, mapgpos[index2]
index1 = index1+1
else:
#interpolate
prevg = mapgpos[index2-1]
prevpos = mappos[index2]
frac = (float(pos)-float(mappos[index2-1]))/ (float(mappos[index2]) - float(mappos[index2-1]))
tmpg = prevg + frac* (mapgpos[index2]-prevg)
print >> outfile, rs, pos, tmpg
# print rs, pos, tmpg
index1 = index1+1
elif pos > mappos[index2]:
#current position in interpolation after marker
if index2 == len(mappos)-1:
#after the last site in the map (genetic position = maximum in map, note could try to extrapolate based on rate instead)
print >> outfile, rs, pos, mapgpos[index2]
# print rs, pos, mapgpos[index2]
index1 = index1+1
else:
#increment the marker
index2 = index2+1
### fix 0s at start of map
zero_poss = []
with open(outfilename) as infile:
line = infile.readline()
line = line.strip().split()
dist = float(line[2])
pos = int(line[1])
while dist == 0.0:
zero_poss.append(pos)
line = infile.readline()
line = line.strip().split()
dist = float(line[2])
pos = int(line[1])
first_nonzero_pos = pos
first_nonzero_dist = dist
cM_per_b = first_nonzero_dist/first_nonzero_pos
outfile = open(outfilename[:-4], 'w')
with open(outfilename) as infile:
line = infile.readline()
outfile.write(line)
line = line.strip().split()
dist = float(line[2])
pos = int(line[1])
assert dist == 0.0
for line in infile:
line_split = line.strip().split()
dist = float(line_split[2])
pos = int(line_split[1])
if(dist == 0.0):
line_split[2] = str(pos*cM_per_b)
outfile.write(line_split[0] + " " + line_split[1] + " " + line_split[2] + "\n")
else:
outfile.write(line)
os.remove(outfilename)