-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdfs.py
234 lines (186 loc) · 8.54 KB
/
dfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Super functions to handle doing several things to a dataframe at once.
"""
import os as _os
from tempfile import NamedTemporaryFile as _tf
import pandas as pd
import dbsnp
import rsids
# Re-export the default normalization and dbSNP lookup files from rsids so the
# functions in this module can use them as keyword-argument defaults.
NORM_FILE = rsids.NORM_FILE
DBSNP_FILE = rsids.DBSNP_FILE
###############################################################################
# Super Wrapper for Study DataFrames #
###############################################################################
def run_all(df, get_locs=False, rsid_col='rsID', risk_allele_col='risk_allele',
            columns=None, convert_df=NORM_FILE, dbsnp_file=DBSNP_FILE):
    """Drop null rows, clean and normalize df, and optionally add locations.

    Steps, in order:

    1. Rows with a null value in any of ``columns`` are removed with
       drop_null(). When ``columns`` is empty or None, drop_null() falls back
       to ['pmid', 'trait', 'rsID', 'population', 'risk_allele']. Note that
       drop_null() works in place on the frame it is given.
    2. The remaining rows are copied so later column assignments do not
       trigger pandas slicing warnings.
    3. The copy is passed to clean_df_add_locations() when ``get_locs`` is
       true, otherwise to clean_df().

    Args:
        df (DataFrame):        The pandas dataframe to filter.
        get_locs (bool):       Also look up locations after cleaning.
        rsid_col (str):        The name of the rsid column.
        risk_allele_col (str): The name of the risk allele column.
        columns (list):        Column names; rows that are null in any of
                               them are dropped.
        convert_df (str):      A normalization file from make_norm_file().
        dbsnp_file (str):      The dbsnp lookup file from make_lookup_tables,
                               should be the .rslookup.rs_sort.txt file.

    Returns:
        DataFrame: The cleaned dataframe; with location columns merged in
                   when ``get_locs`` is true.
    """
    print('Dropping null columns')
    working = drop_null(df, columns).copy()

    if not get_locs:
        return clean_df(working, rsid_col, risk_allele_col, convert_df)

    return clean_df_add_locations(working, rsid_col, risk_allele_col,
                                  convert_df, dbsnp_file)
def clean_df(df, rsid_col='rsID', risk_allele_col='risk_allele',
             convert_df=NORM_FILE):
    """Run the whitespace, risk-allele, rsID, PMID and duplicate clean-ups.

    Each stage prints a short progress message before it runs. The stages
    are: clean_df_whitespace(), clean_risk_alleles(),
    rsids.clean_and_normalize_rsids(), casting the 'pmid' column to str, and
    drop_duplicates().

    Args:
        df (DataFrame):        The pandas dataframe to filter.
        rsid_col (str):        The name of the rsid column.
        risk_allele_col (str): The name of the risk allele column.
        convert_df (str):      A normalization file from make_norm_file().

    Returns:
        DataFrame: The dataframe produced by the final drop_duplicates()
                   stage.
    """
    print('Stripping whitespace')
    stripped = clean_df_whitespace(df)

    print('Cleaning risk alleles')
    filtered = clean_risk_alleles(stripped, risk_allele_col)

    print('Normalizing rsIDs')
    normalized = rsids.clean_and_normalize_rsids(filtered, rsid_col,
                                                 convert_df)

    print('Force PMIDs to string')
    normalized['pmid'] = normalized.pmid.astype(str)

    print('Drop duplicates')
    return drop_duplicates(normalized)
def clean_df_add_locations(df, rsid_col='rsID', risk_allele_col='risk_allele',
                           convert_df=NORM_FILE, dbsnp_file=DBSNP_FILE):
    """Fully clean and normalize df and add chromosome locations.

    Runs clean_df() and then passes the cleaned dataframe to
    clean_rsids_add_locations() to fetch location info.

    Args:
        df (DataFrame):        The pandas dataframe to filter.
        rsid_col (str):        The name of the rsid column.
        risk_allele_col (str): The name of the risk allele column.
        convert_df (str):      A normalization file from make_norm_file().
        dbsnp_file (str):      The dbsnp lookup file from make_lookup_tables,
                               should be the .rslookup.rs_sort.txt file.

    Returns:
        DataFrame: The result of clean_rsids_add_locations() on the cleaned
                   dataframe.
    """
    # clean_df() returns a new frame (row filtering and de-duplication are
    # not purely in place), so its result must be kept rather than discarded.
    df = clean_df(df, rsid_col, risk_allele_col, convert_df)
    print('Getting locations')
    return clean_rsids_add_locations(df, rsid_col, convert_df, dbsnp_file)
###############################################################################
# Fetch Location Columns #
###############################################################################
def clean_rsids_add_locations(df, rsid_col='rsID',
                              convert_df=NORM_FILE, dbsnp_file=DBSNP_FILE):
    """Normalize rsids and add location columns from the dbSNP lookup to df.

    Runs rsids.normalize_rsids() on the rsid column (assigning the result
    back onto ``df``) and then calls dbsnp.join_rsid() to fetch location
    info, which is left-merged onto ``df`` by rsid.

    Args:
        df (DataFrame):   The pandas dataframe to annotate.
        rsid_col (str):   The name of the rsid column.
        convert_df (str): A normalization file from make_norm_file().
        dbsnp_file (str): The dbsnp lookup file from make_lookup_tables,
                          should be the .rslookup.rs_sort.txt file
                          (zipped ok).

    Returns:
        DataFrame: ``df`` left-merged with the locations returned by
                   dbsnp.join_rsid(), or ``df`` itself when join_rsid()
                   returns None.
    """
    df[rsid_col] = rsids.normalize_rsids(df[rsid_col], convert_df)

    tf = get_tempfile_name()
    try:
        locations = dbsnp.join_rsid(df[rsid_col], dbsnp_file, tf, as_df=True)
    finally:
        # tf is handed to join_rsid as a scratch path (presumably for its
        # intermediate output -- confirm against dbsnp). Clean it up even if
        # join_rsid raises, and do not fail if it was never created.
        if _os.path.exists(tf):
            _os.remove(tf)

    if locations is None:
        print('No rows have locations in dbSNP file')
        return df

    print('{} of {} rows have new locations, merging'
          .format(len(locations), len(df)))
    return pd.merge(df, locations, how='left', left_on=rsid_col,
                    right_index=True)
###############################################################################
# Clean DF #
###############################################################################
def drop_duplicates(df):
    """Drop rows duplicated on pmid, trait, rsID, population and risk_allele.

    Works in place on ``df``. When a 'source' column is present the frame is
    first sorted by 'source' (so the first row per duplicate group, which is
    the one kept, is the one with the lowest source value) and, after
    de-duplication, re-sorted by 'rsID'. Without a 'source' column no sorting
    is done. The number of dropped rows is printed.

    Args:
        df (DataFrame): Dataframe containing the five key columns.

    Returns:
        DataFrame: The same dataframe object, with duplicates removed.
    """
    dedup_keys = ['pmid', 'trait', 'rsID', 'population', 'risk_allele']
    has_source = 'source' in df.columns
    rows_before = len(df)

    if has_source:
        df.sort_values('source', inplace=True)
    df.drop_duplicates(dedup_keys, inplace=True)
    if has_source:
        df.sort_values('rsID', inplace=True)

    print('Dropped {} duplicates'.format(rows_before - len(df)))
    return df
def clean_df_whitespace(df):
    """Strip leading and trailing whitespace from every object-dtype column.

    Each object column is cast to str and stripped, and the result is
    assigned back onto ``df``; columns of any other dtype are left untouched.
    Because of the str cast, non-string values in an object column
    (including missing values) become their string representation.

    Args:
        df (DataFrame): Any pandas DataFrame.

    Returns:
        DataFrame: The same dataframe object, with object columns stripped.
    """
    for column in df.columns:
        # Compare the dtype name by value; `is` against a string literal is
        # an identity test and is not guaranteed to hold.
        if df[column].dtype.name == 'object':
            # Series has no .strip(); string methods live on the .str
            # accessor.
            df[column] = df[column].astype(str).str.strip()
    return df
def clean_risk_alleles(df, risk_allele_column='risk_allele'):
    """Upper-case and strip risk alleles, keeping only A, T, G or C rows.

    The risk allele column of ``df`` is normalized in place (upper-cased and
    stripped of surrounding whitespace). Rows whose normalized value is not
    exactly one of A, T, G or C are then filtered out, and the number of
    filtered rows is printed.

    Args:
        df (DataFrame):           Any pandas DataFrame.
        risk_allele_column (str): The name of the risk allele column.

    Returns:
        DataFrame: The rows of ``df`` with a valid risk allele.
    """
    valid_alleles = list('ATGC')

    df[risk_allele_column] = df[risk_allele_column].str.upper().str.strip()

    rows_before = len(df)
    kept = df[df[risk_allele_column].isin(valid_alleles)]
    dropped = rows_before - len(kept)
    print('{} bad risk alleles dropped for not being A/T/G/C'
          .format(dropped))
    return kept
def drop_null(df, columns=None):
    """Drop, in place, all rows with a null entry in any of columns.

    Prints the number of null rows per column before dropping and the total
    number of rows dropped afterwards.

    Args:
        df (DataFrame): Any pandas DataFrame containing ``columns``.
        columns (list): Column names to check; when empty or None, defaults
                        to ['pmid', 'trait', 'rsID', 'population',
                        'risk_allele'].

    Returns:
        DataFrame: The same dataframe object, with the null rows removed.
    """
    required = columns or ['pmid', 'trait', 'rsID', 'population',
                           'risk_allele']

    print('Null rows:')
    for name in required:
        null_mask = df[name].isnull()
        print('\t', name, len(df[null_mask]))

    rows_before = len(df)
    df.dropna(subset=required, inplace=True)
    print('Dropped {} null rows'.format(rows_before - len(df)))
    return df
###############################################################################
# Helper Functions #
###############################################################################
def get_tempfile_name():
    """Return a fresh temporary file path by borrowing one from tempfile.

    A NamedTemporaryFile is opened only to obtain its generated name and is
    closed again straight away, so the returned path is available for the
    caller to create.

    Returns:
        str: The path the temporary file had.
    """
    with _tf('w') as handle:
        name = handle.name
    return name