-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscript.py
122 lines (91 loc) · 3.07 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 11 12:28:25 2019
@author: tothp
"""
#%%
import numpy as np
import matplotlib.pyplot as pl
import os
folder = "C:\\projects\\gwas"
fname_genotypes = "popGen.txt"
fname_phenotypes = "AM_BV_Combined.csv"
fname_matching = "AM_Seq_SkodIDs.txt"
import pandas as pd
phenotype = pd.read_csv(os.path.join(folder, fname_phenotypes), delimiter=";")
# phenotype features of samples, mean over rings
# this is in the order of mum name
traits = {
trait: phenotype[phenotype["Trait"] == trait].filter(regex="Ring")
for trait in phenotype["Trait"].unique()
}
# this is in the order of mum name
delimiter = "\t"
phenotype_genotype_matching = pd.read_csv(
os.path.join(folder, fname_matching),
delimiter=delimiter,
header=None
).sort_values(1)
def get_alleles(line: str):
return line.split(delimiter)[3:]
def get_n_snps(path: str):
k = -1 # -1 for the header
with open(path) as file:
for line in file:
k += 1
return k
def check_row_col_consistency(path: str):
with open(path) as file:
for k, line in enumerate(file):
if k == 0:
n_cols = len(line.split(delimiter))
else:
if len(line.split(delimiter)) != n_cols:
print("data inconsistent")
print("first inconsistent row: {}".format(k))
print("data consistent")
return None
def get_unique_alleles(path: str):
unique_alleles = set()
with open(path) as file:
for k, line in enumerate(file):
if k != 0:
line = line.strip()
alleles = get_alleles(line)
unique_alleles.update(alleles)
return unique_alleles
def is_wildcard(allele: str):
return "*" in allele or "." in allele or "," in allele
def is_valid(allele: str):
return not is_wildcard(allele)
#%%
path_genotypes = os.path.join(folder, fname_genotypes)
n_snps = get_n_snps(path_genotypes)
check_row_col_consistency(path_genotypes)
#%%
header = open(path_genotypes).readline().split("\t")
sample_names = header[3:]
n_samples = len(sample_names)
unique_alleles = get_unique_alleles(path_genotypes)
#%%
wildcard_inds = np.nonzero([is_wildcard(a) for a in unique_alleles])[0]
#%%
ind_allele_mapping = {allele: i for i, allele in enumerate(unique_alleles)}
#%%
genotypes = np.zeros((n_samples, n_snps))
with open(path_genotypes) as file:
for k, line in enumerate(file):
if k != 0:
line = line.strip()
alleles = get_alleles(line)
genotypes[:, k - 1] = [ind_allele_mapping[i] for i in alleles]
if k % 10000 == 0:
print("{} lines processed".format(k))
#%% ./. is some missing value, get the number of these per row
missing_ind = 28
n_missing = np.sum(genotypes == ind_allele_mapping["./."], axis=1)
pl.plot(n_missing)
pl.xlabel("sample no.")
pl.ylabel("number of missing measurements")
#%% get the number of samples with enough valid measurements
ind_valid = [val for key, val in ind_allele_mapping.items() if is_valid(key)]