-
Notifications
You must be signed in to change notification settings - Fork 0
/
FlaggedSNPs.py
816 lines (660 loc) · 40.2 KB
/
FlaggedSNPs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
'''
Created on May 15, 2015
@author: Harald Ringbauer
'''
import numpy as np
import matplotlib.pyplot as plt
# import matplotlib.colors
from scipy.stats import norm
from scipy.stats import binned_statistic
from scipy.misc import factorial # @UnresolvedImport
import itertools
from time import time
from random import choice
from collections import Counter
from scipy.optimize import curve_fit
from pyproj import Proj # @UnresolvedImport
#from colorpins import extract_colored_kml
draw_parent = False # Whether simulated data is shown in correlogram
min_relatedness = 0.2 # Kinship threshold for PO detection
max_opp_homos = 1 # Maximal number of opposing homos for P0 detection (strictly <)
error_rate = 0.05 # Estimated error rate for SNP genotyping
lat_center = 42.32271733
long_center = 2.074444088
class FlaggedSNPs(object):
'''
Class for Flagged SNPs and other relevant Data. Has methods to analyze the data
'''
data, p, coords, names, color, id, maf = 0, 0, 0, 0, 0, 0, 0
po_list = [] # List of Parent-Offspring coordinates
def __init__(self, Data):
'''Initialize '''
good_SNPs = np.where(Data.sNP_okay == 1)[0]
self.data = Data.subdata[:, good_SNPs].astype('float') # Load data from Subdata (cut out the good snps)
self.data_p_mean = Data.data_p_mean[:, good_SNPs - Data.sNP_inds[0]] # Load matrix of estimated mean allele frequencies (good snps already cut out)
self.p = Data.p[good_SNPs - Data.sNP_inds[0]].astype('float') # Load allele frequencies
# self.coords = Data.subdata[:, Data.x_cords_ind:Data.y_cords_ind + 1].astype('float')
self.coords = Data.coords
self.names = Data.header[good_SNPs]
self.color = Data.subdata[:, Data.color_ind] # What is the color ind
self.id = Data.subdata[:, Data.id_ind] # Assumes id-field is the first
self.set_colors()
print("Analyzing %i SNPs from %i individuals" % (len(self.data[0, :]), len(self.data[:, 0])))
def set_colors(self):
'''Set the color vector'''
color = np.array(['grey' for _ in range(0, len(self.color))]).astype('|S16')
color[self.color == 'Y'] = 'y'
color[self.color == 'FR'] = 'm'
color[self.color == 'W'] = 'azure'
color[self.color == 'FO'] = 'darkorange'
color[self.color == 'WO'] = "sandybrown"
color[self.color == 'WR'] = "lightsalmon"
self.color = color
def xy_to_gps(self, delta_x, delta_y):
'''Extract GPS data relative to center
Takes vector as input and outputs vector - Return lat and long'''
my_proj = Proj(proj='utm', zone="31T",
ellps='WGS84', units='m') # Prepare the Longitude/Latidude to Easting/Northing transformer
center_x, center_y = my_proj(long_center, lat_center) # Transform to Easting Northing
longs, lats = [], []
for i in range(len(delta_x)): # Iterate
lon, lat = my_proj(center_x + delta_x[i], center_y + delta_y[i], inverse=True)[:2]
longs.append(lon)
lats.append(lat)
return longs, lats
def principal_components(self):
'''Does a quick PC-analysis'''
data, p, coords = self.data, self.p, self.coords # Extract filtered SNPs.
# Impute missing genotypes
for (ind, sNP), value in np.ndenumerate(data): # Iterate over all data points
if value == -9: # In case of missing data
data[ind, sNP] = np.random.binomial(2, p[sNP]) # Draw random genotype for missing data
_, score, _ = princomp(data)
pca1_mean, bin_edges, _ = binned_statistic(coords[:, 0], score[0, :], bins=15, statistic='mean') # Calculate Bin Values
plt.figure()
plt.title("PCA")
plt.scatter(score[0, :], score[1, :], color=self.color, alpha=0.7)
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.axis('equal')
plt.show()
# Scatter-Plot of 1st principal component vrs distance
plt.figure()
plt.scatter(coords[:, 0], score[0, :], color=self.color, alpha=0.7)
plt.xlabel("x-Coordinate")
plt.ylabel("1st Principal Component")
plt.hlines(pca1_mean, bin_edges[:-1], bin_edges[1:], colors='g', lw=5, label='Binned data')
plt.axhline(0, label="Null line", color='k', linewidth=1)
plt.legend()
plt.show()
plt.scatter(coords[:, 0], score[1, :], color=self.color, alpha=0.7)
plt.xlabel("x-Coordinate")
plt.ylabel("2nd Principal Component")
plt.show()
# Plot Geography vrs first component
plt.scatter(coords[:, 0], coords[:, 1], c=score[0, :], alpha=0.4)
plt.show()
# Plot Geography vrs second component
plt.scatter(coords[:, 0], coords[:, 1], c=score[1, :], alpha=0.4)
plt.show()
# Some special code for filtering out what strange things are going on with PCA2:
# inds = np.where((score[1, :] > 2.5) * (coords[:, 0] > 0))[0] # Trololol
# inds = np.where((score[1, :] > 2.5) * (coords[:, 0] > 0))[0]
# print(inds)
# print(data[inds, :])
# print(self.id[inds])
# np.savetxt("special_inds.csv", data[inds, :], delimiter='$')
# self.color[inds] = "k"
print("Extracting Google Maps file")
longs, lats = self.xy_to_gps(coords[:, 0], coords[:, 1])
#extract_colored_kml(score[0, :], lats, longs, self.id, "pca.kml")
#extract_colored_kml(score[1, :], lats, longs, self.id, "pca2.kml") # Second PCA axis
print("Extraction complete")
def analyze_correlations(self, max_dist):
''' Method which analyzes allele correlations for every sample pair and creates plots'''
# Preprocessing of SNPs
print("Extract SNPs...")
data, p, coords = self.data, self.p, self.coords # Extract filtered SNPs.
p_mean = self.data_p_mean
nr_inds = len(data[:, 0])
n = nr_inds * (nr_inds - 1) / 2 # Number of all pairwise comparisons
prelim_results = np.zeros(n)
homos = np.zeros(n)
pair_distance = np.zeros(n)
# Randomly permute geographic labels:
# i=np.random.permutation(np.size(coords,0))
# coords=coords[i,:]
print("Analysing %i suitable SNPs. " % len(data[0, :]))
# Do the correlation analysis:
dist_mat = calc_distance_matrix(coords) # Calculate the distance matrix
t = time()
for (i, j) in itertools.combinations(np.arange(np.size(data, 0)), r=2):
ind = j * (j - 1) / 2 + i # Index in a linearized array
# estimator, homo = self.kinship_coeff(data[i, :], data[j, :], p) # Kinship coeff per pair, averaged over loci + Number of opposing homozygotes
estimator, homo = self.kinship_coeff_loc_freq(data[i, :], data[j, :], p_mean[i, :], p_mean[j, :]) # Kinship coeff. p.p, averaged over loci + Number of opp. homos
prelim_results[ind] = estimator
homos[ind] = homo
# Calculate Pairwise distance
pair_distance[ind] = dist_mat[i, j] # Load the distance matrix
# Append Lines of detected PO to P0-List
if estimator > min_relatedness and homo < max_opp_homos:
self.po_list.append([coords[i, :], coords[j, :]])
print("Elapsed Time: %2f" % (time() - t))
# Quicker, vectorized way of doing calculations
# t= time()
# f_ufunc = np.frompyfunc(self.kinship_coeff_loc_freq, 4, 2) # 4 inputs, 2 output
# estimator, homo = f_ufunc(data[:,None],data, p_mean[:,None],p_mean)
# dist_mat,estimator,homo = dist_mat.flatten(), estimator.flatten(), homo.flatten()
#
#
# print("Elapsed Time: %2f" % (time() - t))
# print(prelim_results[:10]-estimator[:10])
# Create some artificial offsprings and measure their value with their parents. Don't forget to delete.
parent1 = np.random.randint(len(data[:, 0]), size=1000)
parent2 = np.random.randint(len(data[:, 0]), size=1000)
distance_parents = [np.linalg.norm(coords[parent1[i], :] - coords[parent2[i], :]) for i in range(0, len(parent1))] # Pairwise distance of parents
offspring = [self.create_offspring(data[parent1[i], :], data[parent2[i], :]) for i in range(0, len(parent1))] # Create Offspring
# Now calculate relationship between parent1 and offspring
kinship = []
for i in range(0, len(parent1)):
par1 = data[parent1[i], :] # Extract all SNPs
kid = offspring[i]
estimator, _ = self.kinship_coeff(par1, kid, p)
kinship.append(np.mean(estimator))
mean_het = np.mean(prelim_results)
distance_mean, bin_edges, _ = binned_statistic(pair_distance, prelim_results, bins=20, statistic='mean', range=[0, max_dist]) # Calculate Bin Values
# Histogram of heterozygosity
gaussian_params = fit_gaussian(prelim_results)
print("Mean kinship coefficient: %.6f" % mean_het)
print("STD of kinship coefficients: %.6f" % gaussian_params[1])
print("Mean kinship of simulated PO-pairs: %.4f" % np.mean(kinship))
x = np.linspace(min(prelim_results), max(prelim_results), 10000)
pdf_fitted = norm.pdf(x, loc=gaussian_params[0], scale=gaussian_params[1])
plt.figure()
plt.hist(prelim_results, bins=50, label='Data', color='g', alpha=0.6)
plt.plot(x, pdf_fitted * len(prelim_results) * (max(prelim_results) - min(prelim_results)) / 50.0, 'r-', label='Gaussian Fit', linewidth=2)
plt.legend()
plt.xlabel("Estimated kinship coefficient per pair")
plt.ylabel("Count")
plt.show()
# Scatter-plot of heterozygosity vrs. distance
homos = np.array(homos)
colors = np.array(['r' for i in range(0, len(homos))]) # Adjust color
colors[homos == 0] = 'b'
colors[homos == 1] = 'y'
plt.figure()
plt.scatter(pair_distance, prelim_results, alpha=0.5, c=colors, label="Pairwise Data")
plt.xlabel("Distance")
plt.ylabel("Estimated shared loci")
plt.hlines(distance_mean, bin_edges[:-1], bin_edges[1:], colors='g', lw=5, label='Binned data')
plt.xlim([0, max_dist])
plt.axhline(mean_het, label="Mean Value", color='k', linewidth=2)
plt.axhline(0.25, label="Expected P0/Siblings", color='y')
if draw_parent:
plt.scatter(distance_parents, kinship, alpha=1, c='k', label="Simulated PO-pairs")
plt.legend()
plt.show()
# Plot Correlations
bin_mean = (bin_edges[:-1] + bin_edges[1:]) / 2.0 # Calculate the mean of the bins
index = np.where(bin_mean > 50) # Only take distances bigger than sigma
C, k, std = fit_log_linear(bin_mean[index], distance_mean[index]) # Fit with exponential fit
Nb_est = 1 / (-k)
Nb_std = (-std / k) * Nb_est
x_plot = np.linspace(min(bin_mean), max(bin_mean), 10000)
# fit = [C * np.exp(r * t) for t in x]
print("Fit: \nC: %.4G \nk: %.4G" % (C, k))
plt.plot(bin_mean, distance_mean, 'ro', label="Estimated Correlation per bin")
plt.plot(x_plot, C + k * np.log(x_plot), 'g', label="Fitted decay")
plt.axhline(mean_het, label="Mean Value", color='k', linewidth=2)
plt.annotate(r'$\bar{N_b}=%.4G \pm %.2G$' % (Nb_est, Nb_std) , xy=(0.6, 0.7), xycoords='axes fraction', fontsize=25)
plt.legend()
plt.ylabel("Estimated Correlation")
plt.xlabel("Distance")
# plt.xscale('log')
plt.show()
print("Samples with no opposing homozygotes: %.1f: " % np.sum(homos == 0))
print("Samples with one opposing homozygote: %.1f: " % np.sum(homos == 1))
# Plot Opposing-Homozygotes distribution
counts = Counter(homos)
hits = np.array(counts.keys())
values = np.array(counts.values())
values = values / float(sum(values)) # To get probabilities
parameters, cov_matrix = curve_fit(poisson, hits, values, p0=np.mean(homos)) # @UnusedVariable
print("\nParameters %.2f\n" % parameters[0])
# plot poisson-deviation with fitted parameter
x_plot = np.linspace(0, hits.max(), 1000)
plt.figure()
plt.plot(x_plot, len(homos) * poisson(x_plot, *parameters), 'r-', lw=2, label="Poisson Fit")
plt.hist(homos, bins=np.linspace(-0.5, 59.5, 61), label="Histogram")
plt.plot(hits, values * len(homos), 'go', label="True data points")
plt.legend()
plt.title("Opposing homozygotes per pair")
plt.show()
x_plot = np.linspace(-0.5, 3, 500)
plt.figure()
plt.plot(x_plot, len(homos) * poisson(x_plot, *parameters), 'r-', lw=2, label="Poisson Fit")
plt.hist(homos, bins=np.linspace(-0.5, 3.5, 5), label="Histogram")
plt.plot(hits[:4], values[:4] * len(homos), 'go', label="True data points")
plt.legend()
plt.title("Opposing homozygotes per pair")
plt.show()
def ym_comparison(self, max_dist=1000, true_color=True):
'''Compares yellow, magenta and inbetween Kinship coefficients along distance classes'''
data, p_all, coords, color = self.data, self.p, self.coords, self.color # Extract filtered SNPs. @UnusedVariable
p_mean = self.data_p_mean
print("Analysing %.1f suitable SNPs. " % len(data[0, :]))
print("Allele Frequs:")
print(p_all)
# p_del=int(input("What SNP do you want to treat as color?"))
# Pretend color is some other SNP:
p_del_vec = np.arange(100) + 5 # Try out 100 random SNPs.
distance_mean_y = np.zeros(20)
distance_mean_m = np.zeros(20)
distance_mean_ym = np.zeros(20)
if true_color == True: # "Forget about the pretending"-variable.
p_del_vec = [-1]
for p_del in p_del_vec: # For every locus in p_del_vec do the deletion thingy
if p_del != -1: # Pretend SNP is "color"
color = self.data[:, p_del]
data = np.delete(self.data, (p_del), axis=1) # Delete this SNP from data to not bias kinship estimates
p_mean = np.delete(p_all, (p_del), axis=0)
else: p_mean = p_mean
print(np.mean(color[color > -1]) / 2.0)
y_comps = [] # Comparisons among yellow
m_comps = [] # Comparisons among magenta
ym_comps = [] # Comparisons inbetween yellow/magenta
for (i, j) in itertools.combinations(np.arange(np.size(data, 0)), r=2):
delta = coords[i, :] - coords[j, :] # Calculate Pairwise distance
pair_distance = np.linalg.norm(delta)
estimator, _ = self.kinship_coeff_loc_freq(data[i, :], data[j, :], p_mean[i, :], p_mean[j, :]) # Kinship coeff per pair, averaged over loci
if color[i] == 2 and color[j] == 2: # Case: Two magentas
# estimator, _ = self.kinship_coeff(data[i, :], data[j, :], p) # Kinship coeff per pair, averaged over loci
y_comps.append([estimator, pair_distance]) # Append Kinship coefff and pairwise distance
if color[i] == 0 and color[j] == 0: # Case: Two yellows
# estimator, _ = self.kinship_coeff(data[i, :], data[j, :], p)
m_comps.append([estimator, pair_distance])
if (color[i] == 0 and color[j] == 2) or (color[i] == 2 and color[j] == 0): # Case: One yellow, one magenta
# estimator, _ = self.kinship_coeff(data[i, :], data[j, :], p)
ym_comps.append([estimator, pair_distance])
# if pair_distance>200 and pair_distance<300: # Mark Comparisons in right distance classe DELETE
# self.po_list.append([coords[i, :], coords[j, :]])
y_comps = np.array(y_comps) # Make everything a numpy array
m_comps = np.array(m_comps)
ym_comps = np.array(ym_comps)
print("Y-mean Kinship: %.6f" % np.mean(y_comps[:, 0]))
print("M-mean Kinship: %.6f" % np.mean(m_comps[:, 0]))
print("YM-mean Kinship: %.6f" % np.mean(ym_comps[:, 0]))
distance_mean_yt, bin_edges, _ = binned_statistic(y_comps[:, 1], y_comps[:, 0], bins=20, statistic='mean', range=[0, max_dist]) # Calculate Bin Values
distance_mean_mt, _, _ = binned_statistic(m_comps[:, 1], m_comps[:, 0], bins=20, statistic='mean', range=[0, max_dist]) # Calculate Bin Values
distance_mean_ymt, _, _ = binned_statistic(ym_comps[:, 1], ym_comps[:, 0], bins=20, statistic='mean', range=[0, max_dist]) # Calculate Bin Values
distance_mean_y += distance_mean_yt / float(len(p_del_vec))
distance_mean_m += distance_mean_mt / float(len(p_del_vec))
distance_mean_ym += distance_mean_ymt / float(len(p_del_vec))
bin_mean = (bin_edges[:-1] + bin_edges[1:]) / 2.0 # Calculate the mean of the bins
plt.figure()
plt.scatter(y_comps[:, 1], y_comps[:, 0], alpha=0.5, c='y', label="Yellow-Yellow")
plt.scatter(m_comps[:, 1], m_comps[:, 0], alpha=0.5, c='m', label="Magenta-Magenta")
plt.scatter(ym_comps[:, 1], ym_comps[:, 0], alpha=0.5, c='g', label="Yellow-Magenta")
plt.xlabel("Distance")
plt.ylabel("Estimated shared loci")
plt.xlim([0, max(bin_edges)])
plt.axhline(0.25, label="Expected P0/Siblings", color='y')
plt.legend()
plt.show()
plt.figure()
plt.plot(bin_mean, distance_mean_y, 'yo', label="Yellow-Yellow")
plt.plot(bin_mean, distance_mean_m, 'mo', label="Magenta-Magenta")
plt.plot(bin_mean, distance_mean_ym, 'go', label="Magenta-Yellow")
plt.legend()
plt.xlabel("Euclidean Distance")
plt.ylabel("Mean Kinship coefficient")
plt.show()
def geo_comparison(self, max_dist=1000):
'''Compares kinship coefficients based on geography'''
data, p, coords, color = self.data, self.p, self.coords, self.color # Extract filtered SNPs. @UnusedVariable
p_mean = self.data_p_mean
print("Analysing %.1f suitable SNPs. " % len(data[0, :]))
# p_del=int(input("What SNP do you want to treat as color?"))
# Pretend color is some other SNP:
y_comps = [] # Comparisons among yellow
m_comps = [] # Comparisons among magenta
h_comps = [] # Comparisons among hybrids
ym_comps = [] # Comparisons inbetween yellow/magenta
color = [1 for i in coords[:, 0]]
for i in range(0, len(color)): # Index everything via geographic position.
if coords[i, 0] < -250: # Set the color border (geographic)
color[i] = 0
elif coords[i, 0] > 400: # Set the right color border (geographic)
color[i] = 2
else: color[i] = 1
# p_y = 0.5 * np.array([np.mean([data[j, i] for j in range(0, len(color)) if color[j] == 0 and data[j, i] > -8]) for i in range(0, len(data[0, :]))])
# p_m = 0.5 * np.array([np.mean([data[j, i] for j in range(0, len(color)) if color[j] == 2 and data[j, i] > -8]) for i in range(0, len(data[0, :]))])
# p_h = 0.5 * np.array([np.mean([data[j, i] for j in range(0, len(color)) if color[j] == 1 and data[j, i] > -8]) for i in range(0, len(data[0, :]))])
for (i, j) in itertools.combinations(np.arange(np.size(data, 0)), r=2):
delta = coords[i, :] - coords[j, :] # Calculate Pairwise distance
pair_distance = np.linalg.norm(delta)
estimator, _ = self.kinship_coeff_loc_freq(data[i, :], data[j, :], p_mean[i, :], p_mean[j, :]) # Kinship coeff per pair, averaged over loci
# estimator, _ = self.kinship_coeff(data[i, :], data[j, :], p) # Kinship coeff per pair, averaged over loci
if color[i] == 2 and color[j] == 2: # Case: Two magentas
# estimator, _ = self.kinship_coeff(data[i, :], data[j, :], p) # Kinship coeff per pair, averaged over loci
m_comps.append([estimator, pair_distance]) # Append Kinship coefff and pairwise distance
elif color[i] == 0 and color[j] == 0: # Case: Two yellows
# estimator, _ = self.kinship_coeff(data[i, :], data[j, :], p)
y_comps.append([estimator, pair_distance])
elif color[i] == 1 and color[j] == 1: # Case: Two hybrids
estimator, _ = self.kinship_coeff(data[i, :], data[j, :], p)
h_comps.append([estimator, pair_distance])
elif (color[i] == 0 and color[j] == 2) or (color[i] == 2 and color[j] == 0): # Case: One yellow, one magenta
# estimator, _ = self.kinship_coeff(data[i, :], data[j, :], p)
ym_comps.append([estimator, pair_distance])
# if pair_distance>200 and pair_distance<300: # Mark Comparisons in right distance classe DELETE
# self.po_list.append([coords[i, :], coords[j, :]])
y_comps = np.array(y_comps) # Make everything a numpy array
m_comps = np.array(m_comps)
ym_comps = np.array(ym_comps)
h_comps = np.array(h_comps)
print("Y-mean Kinship: %.6f" % np.mean(y_comps[:, 0]))
print("M-mean Kinship: %.6f" % np.mean(m_comps[:, 0]))
print("YM-mean Kinship: %.6f" % np.mean(ym_comps[:, 0]))
distance_mean_y, bin_edges, _ = binned_statistic(y_comps[:, 1], y_comps[:, 0], bins=15, statistic='mean', range=[0, 1500]) # Calculate Bin Values
distance_mean_m, bin_edges_m, _ = binned_statistic(m_comps[:, 1], m_comps[:, 0], bins=15, statistic='mean', range=[0, 1500]) # Calculate Bin Values
distance_mean_ym, bin_edges_ym, _ = binned_statistic(ym_comps[:, 1], ym_comps[:, 0], bins=9, statistic='mean', range=[800, max_dist]) # Calculate Bin Values
distance_mean_h, bin_edges_h, _ = binned_statistic(h_comps[:, 1], h_comps[:, 0], bins=6, statistic='mean', range=[0, 600]) # Calculate Bin Values
# Calculate the Variances
dist_std_y, _, _ = binned_statistic(y_comps[:, 1], y_comps[:, 0], bins=15, statistic=np.std, range=[0, 1500]) # Calculate Bin Values
dist_std_m, _, _ = binned_statistic(m_comps[:, 1], m_comps[:, 0], bins=15, statistic=np.std, range=[0, 1500]) # Calculate Bin Values
dist_std_h, _, _ = binned_statistic(h_comps[:, 1], h_comps[:, 0], bins=6, statistic=np.std, range=[0, 600]) # Calculate Bin Values
bin_mean = (bin_edges[:-1] + bin_edges[1:]) / 2.0 # Calculate the geographic mean of the bins
bin_mean_m = (bin_edges_m[:-1] + bin_edges_m[1:]) / 2.0
bin_mean_ym = (bin_edges_ym[:-1] + bin_edges_ym[1:]) / 2.0
bin_mean_h = (bin_edges_h[:-1] + bin_edges_h[1:]) / 2.0
# plt.figure()
# plt.scatter(y_comps[:, 1], y_comps[:, 0], alpha=0.5, c='y', label="Yellow-Yellow")
# plt.scatter(m_comps[:, 1], m_comps[:, 0], alpha=0.5, c='m', label="Magenta-Magenta")
# plt.scatter(ym_comps[:, 1], ym_comps[:, 0], alpha=0.5, c='g', label="Yellow-Magenta")
# plt.xlabel("Pairwise Euclidean distance")
# plt.ylabel("Kinship coefficient ")
# plt.xlim([0, max(bin_edges)])
# plt.axhline(0.25, label="Expected kinship PO/Full Sibs", color='y')
# plt.legend()
# plt.show()
plt.figure()
plt.plot(bin_mean, distance_mean_y, 'yo', label="Yellow-Yellow", markersize=10)
plt.plot(bin_mean_m, distance_mean_m, 'mo', label="Magenta-Magenta", markersize=10)
plt.plot(bin_mean_ym, distance_mean_ym, 'go', label="Magenta-Yellow", markersize=10)
plt.plot(bin_mean_h, distance_mean_h, 'wo', label="Hybrid-Hybrid", markersize=10)
plt.tick_params(labelsize=20)
plt.xlabel("Euclidean Distance", fontsize=28)
plt.ylabel("f", fontsize=28)
plt.legend()
plt.show()
plt.figure()
plt.plot(bin_mean, dist_std_y, 'ys', label="Yellow-Yellow", markersize=10)
plt.plot(bin_mean_m, dist_std_m, 'ms', label="Magenta-Magenta", markersize=10)
plt.plot(bin_mean_h, dist_std_h, 'ws', label="Hybrid-Hybrid", markersize=10)
plt.xlabel("Euclidean Distance")
plt.ylabel("Standard Deviation Kinship")
plt.legend()
plt.tick_params(labelsize=20)
plt.show()
def kinship_coeff(self, sample1, sample2, p):
'''Takes two samples as input(SNP-numpy arrays) and calculates the kinship coefficient
Additionally return the number of opposing homozygotes'''
working_SNPs = np.where(sample1[:] + sample2[:] > -1)[0] # Which entries to actually use
pi = 0.5 * sample1[working_SNPs]
# qi = 1 - pi
pj = 0.5 * sample2[working_SNPs]
# qj = 1 - pj
pt = p[working_SNPs] # Temporary p
# qt = 1 - pt
# estimator = np.mean((pi - pt) * (pj - pt) / pt + (qi - qt) * (qj - qt) / qt) # Multi allelic version
estimator = np.mean((pi - pt) * (pj - pt) / (pt * (1 - pt)))
homos = np.sum(np.absolute(pi - pj) == 1)
return (estimator, homos)
def kinship_coeff_loc_freq(self, sample1, sample2, p1, p2):
'''Implement an updated version of the kinship coefficient - assuming better knowledge of allele frequencies
and calculate pairwise relationship based on updated frequencies '''
p1_okay = np.minimum(p1, 2 - p1) > (2.0 * self.maf) # Everything times 2 since given all-freqs always times 2
p2_okay = np.minimum(p2, 2 - p2) > (2.0 * self.maf)
working_snps = sample1[:] + sample2[:] > -1 # Since -9 on bad entries
working_SNPs = np.where(p1_okay * p2_okay * working_snps)[0] # Which entries to actually use - all three categories have to be good (i.e.= 1)
pi = 0.5 * sample1[working_SNPs] # Get allele freqs per sample (normalize to 0,1)
pj = 0.5 * sample2[working_SNPs]
pt1, pt2 = 0.5 * p1[working_SNPs], 0.5 * p2[working_SNPs] # Estimated parental all frequs
pc = (pt1 + pt2) / 2.0 # Calculate the mean allele frequency (i.e. the "common" pool)
estimator = np.mean((pi - pt1) * (pj - pt2) / (pc * (1 - pc))) # Calculate the mean relatedness
homos = np.sum(np.absolute(pi - pj) == 1)
return (estimator, homos)
def create_offspring(self, sNPs1, sNPs2):
'''Combines two individuals randomly to a new individual'''
new_SNPs = [-9 for _ in range(0, len(sNPs1))] # Initialize new SNP, default is error
for k in range(0, len(sNPs1)):
if sNPs1[k] + sNPs2[k] > -1:
# Print draw random number with Prob. sNP1 and random number with Prob sNp2:
new_SNPs[k] = (np.random.binomial(1, sNPs1[k] / 2.0) + np.random.binomial(1, sNPs2[k] / 2.0))
return np.array(new_SNPs)
def plot_geography(self):
'''Plots geography of the data. Now with flower color!!!'''
plt.figure()
plt.scatter(self.coords[:, 0], self.coords[:, 1], c=self.color, alpha=0.5)
plt.ylim([-1000, 2000])
plt.axis("equal")
# plt.ylim([-500, 500])
# Draw PO-lines:
for i in self.po_list:
plt.plot([i[0][0], i[1][0]], [i[0][1], i[1][1]], c='r', linewidth=2)
plt.axvline(-250, color='k', linestyle='solid')
plt.axvline(400, color='k', linestyle='solid')
plt.tick_params(labelsize=22)
plt.grid()
plt.show()
def permute_colors(self, dist=60, neighbr_nr=100):
'''Permutes color values with values from neighbors. Save it in self.colors'''
color1 = [0 for _ in self.color] # New color vector
pool_cand = [] # Pool for possible candidates
for i in range(0, len(self.color)):
for j in range(max(i - neighbr_nr, 0), min(i + neighbr_nr, len(self.color) - 1)): # Iterate through possible neighbours
dist_ij = np.linalg.norm(self.coords[i, :] - self.coords[j, :])
if dist_ij < dist: # If Pairwise distance small enough, add to pool
pool_cand.append(j) # Append possible candidate
color1[i] = self.color[choice(pool_cand)] # Replace color; at least element itself is in List
pool_cand = [] # Delete Candidate List
self.color = np.array(color1)
print("Colors permuted")
def nbh_analysis(self, nbh_dist, min_nbh=6):
'''Determines for every individual its neighbors max. nbh_dist away and calculates their mean kinship coefficient.'''
data, p = self.data, self.p
nbh = [[] for _ in data[:, 0]] # Generates the neighborhood list
mean_corr = [] # Placeholder array
for i in range(0, len(nbh)): # Generate the neighborhood for every plant
if i % 100 == 0: # Do let programmer know where program is
print("Doing: %.1f" % i)
j = i # Index for other individual
while j > -1:
dist_ij = np.linalg.norm(self.coords[i, :] - self.coords[j, :])
if dist_ij < nbh_dist:
nbh[i].append(j)
if (self.coords[i, 0] - self.coords[j, 0]) > nbh_dist: break # If x-distance bigger than max distance.
else: j -= 1
j = i + 1 # Now go x-distances up (assuming everything is sorted by x-axis)
while j < len(nbh):
dist_ij = np.linalg.norm(self.coords[i, :] - self.coords[j, :])
if dist_ij < nbh_dist:
nbh[i].append(j)
if (self.coords[j, 0] - self.coords[i, 0]) > nbh_dist: break
else: j += 1
# Plot Size of the Nbhs:
nbh_tot = np.array([len(i) for i in nbh]).astype('int')
plt.figure()
plt.plot(self.coords[:, 0], nbh_tot, 'ro')
plt.xlabel("X-Coord")
plt.ylabel("Neighbors")
plt.show()
plt.figure()
plt.scatter(self.coords[:, 0], self.coords[:, 1], c=nbh_tot, alpha=0.5)
plt.colorbar()
plt.show()
for k in range(0, len(nbh)):
if k % 100 == 0: # To let programmer know where program is
print("Doing: %.1f" % k)
pair_comps = [] # Array for all pairwise Kinship estimates
for (i, j) in itertools.combinations(nbh[k], r=2):
estimator, _ = self.kinship_coeff(data[i, :], data[j, :], p) # Kinship estimate per pair, averaged over loci
pair_comps.append(estimator) # Append Kinship estimate
if len(nbh[k]) > min_nbh:
mean_corr.append(np.mean(pair_comps)) # Mean Kinship
else: mean_corr.append(-9) # Not enough individuals for reliable estimate
# Plot the Neighborhood-Kinship estimates:
mean_corr = np.array(mean_corr)
good_inds = mean_corr != -9
plt.figure()
plt.plot(self.coords[good_inds, 0], mean_corr[good_inds], 'ro')
plt.xlabel("X-Coord")
plt.ylabel("Kinship-Estimate")
plt.show()
mean_correlation, bin_edges, _ = binned_statistic(self.coords[good_inds, 0], mean_corr[good_inds], bins=20, statistic='mean') # Calculate Bin Values
plt.figure()
plt.scatter(self.coords[good_inds, 0], self.coords[good_inds, 1], c=mean_corr[good_inds], alpha=0.5, vmin=-0.01, vmax=0.05)
plt.xlim([min(self.coords[good_inds, 0]) - 50, max(self.coords[good_inds, 0]) + 50])
plt.ylim([min(self.coords[good_inds, 1]) - 50, max(self.coords[good_inds, 1]) + 60])
plt.colorbar()
plt.show()
bin_mean = (bin_edges[:-1] + bin_edges[1:]) / 2.0 # Calculate the mean of the bins
print(bin_mean)
print(mean_correlation)
plt.figure()
plt.plot(bin_mean, mean_correlation, 'ro')
plt.xlabel("x-Axis")
plt.ylabel("Mean correlation")
plt.grid()
plt.show()
def loc_all_freq_covariances(self, min_dist=30, max_dist=200):
'''Calculate and plot local allele frequency correlations'''
data, coords, p_mean = self.data, self.coords, self.data_p_mean # Extract filtered SNPs.
n = len(data[:, 0])
print("Calculate pairwise distances...") # Calculate pairwise distances
dist_mat = calc_distance_matrix(coords)
indices = (dist_mat < max_dist) * (dist_mat > min_dist) # Mark the individuals to do
neighbors = np.sum(indices, axis=1) # Quicker vectorized way
print("Neighbors: ")
print(neighbors)
print("Calculate average f per relevant pair...")
x = np.zeros(n) # Create empty vector for the data
has_nb = np.where(neighbors > 0)[0] # Create list where actually neighbors
for i in has_nb:
inds = np.where(indices[i, :] == 1)[0] # Where the distances are right
x[i] = np.mean([self.kinship_coeff_loc_freq(data[i, :], data[j, :], p_mean[i, :], p_mean[j, :])[0] for j in inds]) # Calculate the mean value per individual
print("Mean value: %.5f" % np.mean(x))
print("Smoothing...")
self.smooth_n_print(x, dist_mat, 150, coords, weights=neighbors, yval="Local f") # Smooth out
def x_dist_cov(self, min_dist=500, max_dist=1000):
'''Calculate and plot allele frequency correlations'''
data, coords, p_mean = self.data, self.coords, self.data_p_mean # Extract filtered SNPs.
n = len(data[:, 0])
dist_mat = calc_distance_matrix(coords)
dist_mat1 = np.array([[-coords[i, 0] + coords[j, 0] for i in range(n)] for j in range(n)])
# Mark the individuals for for comparison: Smaller than maximal distance, greater than min x-distance
indices = (dist_mat < max_dist) * (dist_mat1 > min_dist)
neighbors = np.sum(indices, axis=1) # Number of neighbours per ind
print("Calculate average f per relevant pair...")
x = np.zeros(n) # Create empty vector for the data
has_nb = np.where(neighbors > 0)[0] # Create list where actually neighbors
for i in has_nb:
inds = np.where(indices[i, :] == 1)[0] # Where the distances are right
x[i] = np.mean([self.kinship_coeff_loc_freq(data[i, :], data[j, :], p_mean[i, :], p_mean[j, :])[0] for j in inds]) # Calculate the mean value per individual
print("Mean value: %.5f" % np.mean(x))
print("Smoothing...")
self.smooth_n_print(x, dist_mat, 150, coords, weights=neighbors, yval="Mean Covariance")
def smooth_n_print(self, x, dist_mat, sigma, coords, weights=0, plot=True, yval="y-Val"):
'''Smooth out the given statistics x and plots against x-axis'''
indices_good = np.array(weights) > 10 # Only if enough neighbors (>10)
x_mean = self.calc_weighted_mean(dist_mat, x, weights, sigma)
if plot == False: return x_mean
plt.figure() # Plot against x-axis
plt.scatter(coords[indices_good, 0], x_mean[indices_good], color=self.color[indices_good], alpha=0.7, s=25)
plt.tick_params(labelsize=20)
plt.xlabel("x-Axis", fontsize=20)
plt.ylabel(yval, fontsize=20)
plt.show()
plt.scatter(self.coords[indices_good, 0], self.coords[indices_good, 1], c=x_mean[indices_good], alpha=0.7, s=25)
plt.tick_params(labelsize=20)
plt.colorbar()
plt.show()
def calc_weighted_mean(self, dist_mat, x, w, sigma):
'''Given a distance matrix and a vector of values,
calculate the weighted mean according to a kernel sigma.
Also weight with w'''
x, dist_mat, w = np.array(x), np.array(dist_mat), np.array(w) # Just in case that not numpy array
weights = 1 / (2.0 * np.pi * sigma ** 2) * np.exp(-dist_mat ** 2 / (2.0 * sigma ** 2)) # Calculate the Gaussian distance weights
weights = w[None, :] * weights # Bring in the number of neighbor weights
print(weights[:10, :10])
x_mean = np.dot(weights, x) / np.sum(weights, axis=1) # Calculate weighted mean
print(x_mean[:10])
return(x_mean)
def homozygosity_analysis(self):
'''Plots homozygosity per sample over all samples'''
data, p = self.data, self.p
homozyg = np.zeros(len(data[:, 0])) # Vector for homozygosity per sample
for i in range(0, len(data[:, 0])):
working_SNPs = np.where((data[i, :] > -1) & (np.minimum(p, 1 - p) > 0.05))[0] # Which entries to actually use; only snps with maf>0.5
p_i = data[i, working_SNPs] / 2.0
p_t = p[working_SNPs] # Temporal allele freq vector @UnusedVariable
# heteros = np.sum(p_i == 0.5) / float(len(p_i)) # Calculate mean heterozygosity
heteros = np.mean(2 * p_i * (1 - p_i) / (p_t * (1 - p_t)))
homozyg[i] = 1 - heteros
hom_mean, bin_edges, _ = binned_statistic(self.coords[:, 0], homozyg, bins=10, statistic='mean') # Calculate Bin Values
# Do the plot
print("Mean homozygosity %.6f:" % np.mean(homozyg))
plt.figure()
plt.scatter(self.coords[:, 0], self.coords[:, 1], c=homozyg, alpha=0.5)
# plt.xlim([min(self.coords[:, 0]) - 50, max(self.coords[:, 0]) + 50])
# plt.ylim([min(self.coords[:, 1]) - 50, max(self.coords[:, 1]) + 60])
plt.colorbar()
plt.show()
plt.figure()
plt.scatter(self.coords[:, 0], homozyg, c=self.color, alpha=0.7)
plt.xlabel("x-Coordinate")
plt.ylabel("Mean Homozygosity")
plt.hlines(hom_mean, bin_edges[:-1], bin_edges[1:], colors='g', lw=5, label='Binned data')
plt.axhline(np.mean(homozyg), label="Null line", color='k', linewidth=1)
plt.show()
def princomp(A):
""" performs principal components analysis
(PCA) on the n-by-p data matrix A
Rows of A correspond to observations, columns to variables.
Returns : coeff is a p-by-p matrix, each column containing coefficients
for one principal component. score :
the principal component scores; that is, the representation
of A in the principal component space. Rows of SCORE
correspond to observations, columns to components.
latent :
a vector containing the eigenvalues
of the covariance matrix of A."""
# computing eigenvalues and eigenvectors of covariance matrix
M = (A - np.mean(A.T, axis=1)).T # subtract the mean (along columns)
[latent, coeff] = np.linalg.eig(np.corrcoef(M)) # attention:not always sorted
score = np.dot(coeff.T, M) # projection of the data in the new space
return coeff, np.real(score), latent
def fit_log_linear(t, y):
'''Fitting exponential decay and returns parameters: y=A*Exp(-kt)'''
t = np.log(t)
param, V = np.polyfit(t, y, 1, cov=True) # Param has highest degree first
return param[1], param[0], np.sqrt(V[0, 0])
def fit_gaussian(values):
''' Fits a Gaussian to array containing values'''
param = norm.fit(values) # distribution fitting
return param # param[0] and param[1] are the mean and the standard deviation of the fitted distribution
def fit_poisson(values):
'''Fits a poisson distribution to array containing values'''
param = poisson.fit(values)
print(param)
def poisson(k, lamb):
# Poisson function, parameter lamb is the fit parameter
return (lamb ** k / factorial(k)) * np.exp(-lamb)
def calc_distance_matrix(coords):
'''Calculate the distance matrix between all coords. Requires numpy array of 2d coordinates'''
print("Calculating distance matrix")
start = time()
dist_mat = np.linalg.norm(coords[:, None] - coords, axis=2)
print("Time taken %.2f" % (time() - start))
return dist_mat