forked from ischrauwen-lab/variant-filtering
-
Notifications
You must be signed in to change notification settings - Fork 0
/
filters.py
176 lines (151 loc) · 6.01 KB
/
filters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import pandas as pd
import numpy as np
# filter the dataFrame (df), keeping rows whose alt/ref allele depth ratio in a
# particular column (name) is greater than the threshold (ad)
def filter_AD(df, name, ad):
    """Keep rows whose alt/ref allele-depth ratio in column *name* exceeds *ad*.

    For each row, the position of the ``AD`` key inside the colon-separated
    ``FORMAT`` string selects the matching field of the sample string in
    column *name*.  That field is read as ``ref,alt[,...]`` and the ratio
    ``alt / max(ref, 1)`` is computed (the ``max`` guards against dividing
    by a zero reference depth).

    Args:
        df: DataFrame with a ``FORMAT`` column and the sample column *name*.
        name: Label of the sample column to inspect.
        ad: Exclusive lower bound for the alt/ref ratio.

    Returns:
        The rows of *df* whose ratio is strictly greater than *ad*.  The
        ratio is exposed in an ``"AD"`` column.

    Raises:
        ValueError: If a ``FORMAT`` string has no ``AD`` key.

    Note:
        As before, the ``"AD"`` column is also written to the frame passed
        in.  AD fields that cannot be parsed (for example ``"."``) count as
        ratio 0, mirroring how ``filter_DP`` treats unparsable depths.
    """
    ratios = []
    for fmt, sample in zip(df["FORMAT"], df[name]):
        ad_pos = fmt.split(":").index("AD")
        try:
            depths = sample.split(":")[ad_pos].split(",")
            ref_depth = int(depths[0])
            alt_depth = int(depths[1])
        except (ValueError, IndexError):
            # Missing or malformed AD field: treat as no alternate support.
            ratios.append(0.0)
        else:
            ratios.append(alt_depth / max(ref_depth, 1))
    df["AD"] = ratios
    return df[df["AD"] > ad]
# filter the dataFrame (df) by minimum depth (dp) in a particular column (name)
# the filtered rows are always returned as a new data frame; the inplace
# argument is accepted for compatibility and does not change the result
def filter_DP(df, name, dp, inplace=1):
    """Keep rows whose read depth (DP) in sample column *name* is at least *dp*.

    For each row, the position of the ``DP`` key inside the colon-separated
    ``FORMAT`` string selects the matching field of the sample string in
    column *name*.  Fields that are missing or are not valid integers
    (for example ``"."``) count as depth 0.

    Args:
        df: DataFrame with a ``FORMAT`` column and the sample column *name*.
        name: Label of the sample column to inspect.
        dp: Inclusive minimum depth.
        inplace: Accepted for backward compatibility only; a new, filtered
            frame is returned whatever its value.

    Returns:
        A copy of the rows of *df* that pass the threshold.  As before, the
        result never carries a ``"DP"`` column.

    Raises:
        ValueError: If a ``FORMAT`` string has no ``DP`` key.
    """
    depths = []
    for fmt, sample in zip(df["FORMAT"], df[name]):
        dp_pos = fmt.split(":").index("DP")
        try:
            depths.append(int(sample.split(":")[dp_pos]))
        except (ValueError, IndexError):
            # Missing or non-integer depth: treat as zero coverage.
            depths.append(0)
    # Filter with a positional mask so no helper column is written to the
    # caller's frame.
    keep = np.asarray(depths, dtype=int) >= dp
    result = df[keep].copy()
    return result.drop(columns="DP", errors="ignore")
# filter the dataFrame (df) by the maximum number of occurrences (cap) of a
# particular zygosity (zyg), e.g. "0/1", in a range of columns
# [namestart,nameend]; rows over the cap are dropped from df in place
def filter_occurences(df, zyg, namestart, nameend, cap):
    """Drop, in place, rows where *zyg* occurs in more than *cap* columns.

    The columns inspected are the contiguous positional range from
    *namestart* to *nameend*, both inclusive.  A column counts as a hit for
    a row when its string value contains *zyg*; the match uses
    ``Series.str.contains`` with its defaults, so *zyg* is interpreted as a
    regular expression.

    Args:
        df: DataFrame to filter; it is modified in place.
        zyg: Zygosity pattern to look for, e.g. ``"0/1"``.
        namestart: Label of the first column in the range.
        nameend: Label of the last column in the range.
        cap: Maximum number of matching columns a row may have and be kept.

    Returns:
        The same *df* object, after the offending rows have been dropped.

    Note:
        Rows are dropped by index label.  A row with a missing value in the
        range has an undefined count and is kept.  An empty range
        (*nameend* positioned before *namestart*) leaves *df* unchanged.
    """
    first = df.columns.get_loc(namestart)
    last = df.columns.get_loc(nameend)
    if last < first:
        return df
    # Summing the per-column boolean masks gives the hit count per row.
    hits = sum(
        df.iloc[:, pos].str.contains(zyg) for pos in range(first, last + 1)
    )
    over_cap = (hits > cap).fillna(False).astype(bool)
    df.drop(index=df.index[over_cap.to_numpy()], inplace=True)
    return df
# filter the dataFrame (df) by the maximum population allele frequency (cap)
def filter_AF(df, cap):
    """Keep rows whose population allele frequencies are all at most *cap*.

    The columns checked are ``AF_popmax``, ``GME_AF``, ``Kaviar_AF`` and
    ``abraom_freq`` plus their ``".1"``-suffixed variants; columns that are
    not present are skipped.  A value of ``"."`` is replaced by ``-1`` so it
    always passes the cap, and each checked column is converted to float.

    Args:
        df: DataFrame to filter; it is left unmodified.
        cap: Inclusive maximum allele frequency.

    Returns:
        A new filtered frame whose checked columns are floats (with ``-1.0``
        where the input had ``"."``).  If none of the frequency columns is
        present, *df* itself is returned unchanged.
    """
    base_columns = ["AF_popmax", "GME_AF", "Kaviar_AF", "abraom_freq"]
    candidates = base_columns + [col + ".1" for col in base_columns]
    present = [col for col in candidates if col in df.columns]
    if not present:
        return df
    # Work on a copy so the "." replacement and float cast never touch the
    # caller's frame.
    df = df.copy()
    for col in present:
        df.loc[df[col] == ".", col] = "-1"
        df[col] = df[col].astype(float)
        df = df[df[col] <= cap].copy()
    return df
# filter the dataFrame (df) for the zygosity (zyg), e.g. "0/1", in a particular
# column (name)
def filter_zyg(df, name, zyg):
    """Keep rows whose value in column *name* contains the zygosity *zyg*.

    The match uses ``Series.str.contains`` with its default settings, so
    *zyg* is interpreted as a regular expression.  Missing values count as
    "no match" instead of raising.  Duplicate rows are removed from the
    result.  If *name* is not a column of *df*, *df* is returned unchanged.

    NOTE(review): the source listing lost its indentation; the
    de-duplication step is assumed to sit inside the column guard - confirm.

    Args:
        df: DataFrame to filter.
        name: Label of the sample column to inspect.
        zyg: Zygosity pattern, e.g. ``"0/1"``.

    Returns:
        The filtered, de-duplicated frame.
    """
    if name in df.columns:
        df = df[df[name].str.contains(zyg, na=False)]
        df = df.drop_duplicates()
    return df
def filter_1x_zyg(df, name, zyg):
    """Keep rows whose value in column *name* starts with the zygosity *zyg*.

    Unlike ``filter_zyg`` the match is a literal prefix test
    (``Series.str.startswith``).  Missing values count as "no match"
    instead of raising.  Duplicate rows are removed from the result.  If
    *name* is not a column of *df*, *df* is returned unchanged.

    NOTE(review): the source listing lost its indentation; the
    de-duplication step is assumed to sit inside the column guard - confirm.

    Args:
        df: DataFrame to filter.
        name: Label of the sample column to inspect.
        zyg: Zygosity prefix, e.g. ``"0/1"``.

    Returns:
        The filtered, de-duplicated frame.
    """
    if name in df.columns:
        df = df[df[name].str.startswith(zyg, na=False)]
        df = df.drop_duplicates()
    return df
# filter the dataFrame (df) to exclude a certain zygosity (zyg) in a particular
# column (name)
def exclude_zyg(df, name, zyg):
    """Drop rows whose value in column *name* contains the zygosity *zyg*.

    The match uses ``Series.str.contains`` with its default settings, so
    *zyg* is interpreted as a regular expression.  Rows with a missing
    value cannot contain *zyg* and are therefore kept instead of raising.
    Duplicate rows are removed from the result.  If *name* is not a column
    of *df*, *df* is returned unchanged.

    NOTE(review): the source listing lost its indentation; the
    de-duplication step is assumed to sit inside the column guard - confirm.

    Args:
        df: DataFrame to filter.
        name: Label of the sample column to inspect.
        zyg: Zygosity pattern to exclude, e.g. ``"1/1"``.

    Returns:
        The filtered, de-duplicated frame.
    """
    if name in df.columns:
        df = df[~df[name].str.contains(zyg, na=False)]
        df = df.drop_duplicates()
    return df
def exclude_1x_zyg(df, name, zyg):
    """Drop rows whose value in column *name* starts with the zygosity *zyg*.

    Unlike ``exclude_zyg`` the match is a literal prefix test
    (``Series.str.startswith``).  Rows with a missing value cannot start
    with *zyg* and are therefore kept instead of raising.  Duplicate rows
    are removed from the result.  If *name* is not a column of *df*, *df*
    is returned unchanged.

    NOTE(review): the source listing lost its indentation; the
    de-duplication step is assumed to sit inside the column guard - confirm.

    Args:
        df: DataFrame to filter.
        name: Label of the sample column to inspect.
        zyg: Zygosity prefix to exclude, e.g. ``"0/1"``.

    Returns:
        The filtered, de-duplicated frame.
    """
    if name in df.columns:
        df = df[~df[name].str.startswith(zyg, na=False)]
        df = df.drop_duplicates()
    return df
# filter out variants that are "Benign" or "Likely benign"
def filter_benign(df):
    """Drop variants whose ``CLNSIG`` value marks them as benign.

    A row is dropped when its ``CLNSIG`` string contains ``"enign"``, which
    covers both ``"Benign"`` and ``"Likely benign"`` style values whatever
    the case of the first letter.  Rows with a missing ``CLNSIG`` value
    carry no benign annotation and are kept.

    Args:
        df: DataFrame with a ``CLNSIG`` column.

    Returns:
        The rows of *df* that are not annotated as benign.

    Raises:
        KeyError: If *df* has no ``CLNSIG`` column.
    """
    is_benign = df["CLNSIG"].str.contains("enign", na=False)
    return df[~is_benign]
# filter the dataFrame (df) for variants whose maximum DP across a list of affected
# people (names) is greater than or equal to the minimum value (dp), a given constant.
# the filtered rows are always returned as a new data frame; the inplace argument
# is accepted for compatibility and does not change the result
def filter_DP_Max(df, names, dp, inplace=1):
    """Keep rows whose highest read depth across *names* is at least *dp*.

    For each row, the position of the ``DP`` key inside the colon-separated
    ``FORMAT`` string selects the matching field of every sample string in
    the columns *names*; the row is kept when the largest of those depths
    reaches *dp*.  Fields that are missing or are not valid integers (for
    example ``"."``) count as depth 0, as in ``filter_DP``.

    Args:
        df: DataFrame with a ``FORMAT`` column and the sample columns.
        names: Labels of the sample columns to take the maximum over.
        dp: Inclusive minimum for the per-row maximum depth.
        inplace: Accepted for backward compatibility only; a new, filtered
            frame is returned whatever its value.

    Returns:
        A copy of the rows of *df* that pass the threshold.  As before, the
        result never carries a ``"DP"`` column.

    Raises:
        ValueError: If *names* is empty or a ``FORMAT`` string has no
            ``DP`` key.
    """
    names = list(names)
    if not names:
        raise ValueError("names must contain at least one sample column")
    # The DP position depends only on FORMAT, so compute it once per row.
    dp_positions = [fmt.split(":").index("DP") for fmt in df["FORMAT"]]
    per_sample = []
    for name in names:
        depths = []
        for sample, pos in zip(df[name], dp_positions):
            try:
                depths.append(int(sample.split(":")[pos]))
            except (ValueError, IndexError):
                # Missing or non-integer depth: treat as zero coverage.
                depths.append(0)
        per_sample.append(depths)
    max_depth = np.max(np.asarray(per_sample, dtype=int), axis=0)
    # Filter with a positional mask so no helper column is written to the
    # caller's frame.
    result = df[max_depth >= dp].copy()
    return result.drop(columns="DP", errors="ignore")
# filter the dataFrame (df) to keep only the rows whose "Chr" value contains a
# particular chromosome string (chrom), or to drop those rows when exclude is True
def filter_chr(df, chrom, exclude = False):
    """Keep (or drop) the rows whose ``Chr`` value contains *chrom*.

    The comparison is made on the string form of the ``Chr`` column in both
    modes, so a numeric chromosome column works as well as a string one.
    If *df* has no ``Chr`` column it is returned unchanged.

    NOTE(review): the match is a ``Series.str.contains`` search, not an
    equality test, so e.g. ``"chr1"`` also matches ``"chr10"`` - confirm
    this is what callers expect.

    Args:
        df: DataFrame to filter.
        chrom: Chromosome pattern to look for (interpreted as a regular
            expression by ``Series.str.contains``).
        exclude: When False (default) keep only matching rows; rows with a
            missing ``Chr`` are dropped.  When True drop the matching rows
            and keep everything else.

    Returns:
        The filtered frame.
    """
    if "Chr" not in df.columns:
        return df
    matches = df["Chr"].astype(str).str.contains(chrom, na=False)
    if exclude:
        return df[~matches]
    # A missing chromosome can never be the requested one.
    return df[matches & df["Chr"].notna()]
# get which gene in a string of genes (genestring) separated by ;
# is in a list of genes (famgenes), or -1 if none are
def gene_in_list(genestring, famgenes):
    """Return the position of the first gene of *genestring* found in *famgenes*.

    Args:
        genestring: Gene names joined by ``";"``.
        famgenes: Container of gene names, tested with ``in``.

    Returns:
        The zero-based position, within the split *genestring*, of the
        first gene that is in *famgenes*, or ``-1`` when none is.
    """
    for position, gene in enumerate(genestring.split(";")):
        if gene in famgenes:
            return position
    return -1
# filter the dataframe for only variants in genes associated with the Family
# object's (fam)'s phenotype
def filter_phen(df, fam):
    """Keep variants in genes associated with the phenotype of *fam*.

    Each ``Gene.refGene`` value is a ``";"``-separated gene string.  A row
    is kept when ``gene_in_list`` finds one of its genes in ``fam.genes``;
    the first such gene is then looked up in ``fam.genes`` (the Family's
    gene to number-of-associated-phenotypes mapping) and the resulting
    count is stored in a new ``"phens_matched"`` column inserted at
    position 3.

    Args:
        df: DataFrame with a ``Gene.refGene`` column.
        fam: Family object exposing a ``genes`` mapping.

    Returns:
        The filtered frame with the ``"phens_matched"`` column added, or an
        empty ``pd.DataFrame()`` when ``fam.genes`` is empty.
    """
    if len(fam.genes) == 0:
        return pd.DataFrame()

    # Position of the first family gene inside every gene string (-1: none).
    gene_strings = df["Gene.refGene"].astype(str)
    positions = [gene_in_list(entry, fam.genes) for entry in gene_strings]

    # Keep only the rows where some gene matched.
    matched = [position != -1 for position in positions]
    df = df[matched]
    kept_positions = [position for position in positions if position != -1]

    # Look up the phenotype count of the matched gene of every kept row.
    counts = []
    for entry, position in zip(df["Gene.refGene"], kept_positions):
        matched_gene = entry.split(";")[position]
        counts.append(fam.genes[matched_gene])

    df.insert(3, "phens_matched", counts)
    return df