'''
Interactive menu offering various methods to analyse the SNP data.
@author: Harald
'''
from Data import Data  # @UnusedImport (only needed when loading the original CSV data)
from FlaggedSNPs import FlaggedSNPs
from extract_data import Extractor
import numpy as np
import matplotlib.pyplot as plt
try:
    import cPickle as pickle  # Python 2: C implementation of pickle
except ImportError:  # Python 3: cPickle was merged into the standard pickle module
    import pickle
# path, snP_inds = '../Data/SNP_data.csv', [2, 181]  # Where to load the raw CSV data from
# path, snP_inds = '../Data/SNPsModelCourse.csv', [1, 4 * 26 + 14]
# path, snP_inds = '../Data/HZ2014.csv', [3, 9 * 26 + 13]
# Where to save/load the pickle file
# path1 = '../Data/SNP_data.p'  # SNP data based on the cline data set
# path1 = '../Data/SNP_data2013.p'  # SNP data based on the 2013 data set
path1 = '../Data/SNP_data2014.p'  # SNP data from 2014
min_maf = 0.1  # Minimum minor allele frequency (MAF) threshold for the IBD analysis
p_val = 0.0001  # Critical p-value for flagging loci that deviate from Hardy-Weinberg equilibrium
r2_score = 0.03  # Maximum r^2 before a locus pair is flagged for linkage disequilibrium (LD)
max_failed_snps = 8  # Maximum number of failed SNPs allowed per individual
sigma = 3000  # Kernel width used for geographic smoothing
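
# --- Illustrative sketch (not called anywhere in this script) -----------------
# A minimal picture of how the p_val and r2_score thresholds above would
# typically be applied to biallelic SNPs. Both helpers are hypothetical
# stand-ins; the actual tests live in Data.SNP_cleaning and Data.ld_check,
# whose internals are not shown here.
def hw_chi2_pvalue(n_AA, n_Aa, n_aa):
    """Chi-square test (1 df) of one SNP against Hardy-Weinberg expectations."""
    from scipy.stats import chi2  # assumption: scipy is available
    n = n_AA + n_Aa + n_aa
    p = (2.0 * n_AA + n_Aa) / (2.0 * n)  # frequency of the A allele
    expected = [n * p ** 2, 2.0 * n * p * (1.0 - p), n * (1.0 - p) ** 2]
    observed = [n_AA, n_Aa, n_aa]
    stat = sum((o - e) ** 2 / e for o, e in zip(observed, expected) if e > 0)
    return 1.0 - chi2.cdf(stat, df=1)  # flag the locus if this falls below p_val


def pairwise_r2(geno_a, geno_b):
    """Squared Pearson correlation between two 0/1/2 genotype vectors."""
    r = np.corrcoef(geno_a, geno_b)[0, 1]
    return r ** 2  # compare against the r2_score threshold for LD flagging
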
# subregion = [-300, 500, -150, 150]  # Core Hybrid Zone
# subregion = [-1084, -300, 840, 1075]  # Yellow Flank 1
# subregion = [285, 1200, 200, 480]  # Magenta Flank 1
# subregion = [-20000, -8000, -10000, 10000]  # Over the pass
# subregion = [-50000, 100000, -100000, 11000]  # Everything
subregion = [-3000, 2000, -10000, 1100]  # Big Core
# subregion = [-2000, 1500, -10000, 1100]  # Slightly smaller than Big Core
# subregion = [-4200, 3000, -10000, 2000]  # Very Big Core
# subregion = [-2000, -500, -10000, 1100]  # Core left half
# subregion = [500, 2000, -10000, 1100]  # Core right half
# subregion = [-500, 500, -10000, 600]  # Vertical core
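
# --- Illustrative sketch (not called anywhere in this script) -----------------
# A rough picture of what a geographic kernel estimate with width sigma might
# look like: a Gaussian-weighted (Nadaraya-Watson) average of per-individual
# values around a query point. Purely illustrative; Data.kernel_estimation is
# the real implementation and may differ.
def gaussian_kernel_smooth(x, y, values, x0, y0, width=sigma):
    """Kernel-weighted mean of `values` at the query point (x0, y0)."""
    d2 = (np.asarray(x) - x0) ** 2 + (np.asarray(y) - y0) ** 2
    weights = np.exp(-d2 / (2.0 * width ** 2))
    return np.sum(weights * np.asarray(values)) / np.sum(weights)
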
def main():
print("Welcome back buddy! \n Loading data...")
#data = Data(path, snP_inds) # Load original data!! (UNCOMMENT)
    with open(path1, "rb") as f:
        data = pickle.load(f)  # Load data from the pickle file
#data.data_p_mean = np.array([2.0 * data.p for _ in data.data[:, 0]]) # Create Default p_mean matrix NORMALLY DONE IN CONSTRUCTOR - HOTFIX
data.set_subdata(-100000, 100000, -100000, 100000) # Set everything as subdata
flagged_SNPs = FlaggedSNPs(data) # Load default flagged SNPs
flagged_SNPs.maf = min_maf # A bit of a workaround to set the MAF
print("Nr of Columns: %i" % len(data.data[0, :]))
while True:
        inp = int(input("\nWhat do you want to do?\n(1) Print geographic data \n(2) Extract sub-data \n(3) Data Cleaning \n(4) Data Analysis "
                        "\n(5) Data Extraction \n(6) Exit\n"))
if inp == 1:
flagged_SNPs.plot_geography()
elif inp == 2:
data.set_subdata(subregion[0], subregion[1], subregion[2], subregion[3]) # Extract subdata
flagged_SNPs = FlaggedSNPs(data) # Update flagged SNPs
data.save_p()
# data.add_subdata(subregion1[0], subregion1[1], subregion1[2], subregion1[3])
# data.add_subdata(subregion2[0], subregion2[1], subregion2[2], subregion2[3])
print("Nr of subsamples %.2f:" % len(data.subdata[:, 0]))
plt.figure()
plt.scatter(data.subdata[:, data.x_cords_ind].astype(float), data.subdata[:, data.y_cords_ind].astype(float), alpha=0.5)
plt.axis("equal")
plt.grid()
plt.show()
elif inp == 3:
while True:
                inp1 = int(input("\nWhat do you want to do?\n(1) Flag failed SNPs \n(2) Delete duplicates \n(3) Update allele frequencies "
                                 "\n(4) Detect faulty individuals \n(5) LD-Check \n(6) Save Data "
                                 "\n(7) Color Correlation \n(8) Kernel Estimation \n(9) Go back\n"))
                if inp1 == 1:
                    data.SNP_cleaning(min_maf, p_val)
                elif inp1 == 2:
                    data.double_elemination()  # Removes duplicates; method name spelling comes from the Data class
                elif inp1 == 3:
                    data.update_p()
                elif inp1 == 4:
                    data.faulty_inds(max_failed_snps)
                elif inp1 == 5:
                    data.ld_check(r2_score)
                elif inp1 == 6:
                    data.save_data(path1)
                    # data.data = data.subdata  # Save subdata
                    # data.subdata = []
                    # pickle.dump(data, open(path1, "wb"))  # Pickle the data
                elif inp1 == 7:
                    data.color_correlation(r2_score)
                elif inp1 == 8:
                    data.kernel_estimation(sigma)
                    data.kernel_estimation_rare(sigma=sigma)
                elif inp1 == 9:
                    flagged_SNPs = FlaggedSNPs(data)
                    break
elif inp == 4:
while True:
inp2 = int(input("\n(1) SNP-analysis \n(2) IBD-analysis \n(3) Y-M comparison \n(4) PC-Analysis \n(5) Permute colors "
"\n(6) Local Covariances \n(7) Geographic Comparison \n(8) Extract Coancestry-Txts \n(9) Back\n"))
                if inp2 == 1:
                    data.plot_SNP_slider()
                elif inp2 == 2:
                    geo_scale = int(input("\nOn what geographic scale? (m)\n"))
                    flagged_SNPs.analyze_correlations(geo_scale)
                elif inp2 == 3:
                    geo_scale = int(input("\nOn what geographic scale? (m)\n"))
                    flagged_SNPs.ym_comparison(geo_scale)
                elif inp2 == 4:
                    flagged_SNPs.principal_components()
                elif inp2 == 5:
                    # data.compare_pass_p()  # Secret option
                    flagged_SNPs.permute_colors()
                elif inp2 == 6:
                    # flagged_SNPs.homozygosity_analysis()
                    # flagged_SNPs.nbh_analysis(nbh_dist=75, min_nbh=5)
                    flagged_SNPs.loc_all_freq_covariances()
                    flagged_SNPs.x_dist_cov()
                elif inp2 == 7:  # Bonus line
                    geo_scale = int(input("\nOn what geographic scale? (m)\n"))
                    flagged_SNPs.geo_comparison(geo_scale)
                elif inp2 == 8:
                    flagged_SNPs.save_coancestry_data()
                elif inp2 == 9:
                    break
elif inp == 5:
extractor = Extractor(data)
while True:
inp2 = int(input("\n(1) Extract Coancestry-Txts\n(2) Extract simulated related pairs"
"\n(3) Extract highly related pairs\n(4) Extract Data for Alex \n(5) Back \n"))
if inp2 == 1:
extractor.save_coancestry_data()
elif inp2 == 2:
                    extractor.extract_pairs(200, 200, 200, 200)  # Numbers of parent-offspring, full-sib, half-sib, and unrelated pairs
elif inp2 == 3:
extractor.extract_high_rel_pairs()
elif inp2 == 4:
extractor.extract_snp_data()
elif inp2 == 5:
break
elif inp == 6:
print("See u!!")
break


if __name__ == '__main__':
    main()