-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdbsnp.py
executable file
·293 lines (241 loc) · 11.1 KB
/
dbsnp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
A bunch of little functions to work with dbSNP files quickly.
Simple workflow:
1. Download dbSNP from NCBI
2. Concatenate all chromosome bed files into a single bed (eg. bed149.bed)
3. Run make_lookup_tables(): dbsnp create -o dbsnp149 <dbsnp149.bed>
4. While that is running, dump all your rsids to a file
5. Run join_rsid(): dbsnp rsids.txt dbsnp149.rslookup.rs_sort.txt rsids.joined.txt
The same thing can be done for locations, just dump your locations to a file
with the locations filtered as chr.position. Then provide either
dbsnp149.rslookup.start_sort.txt or dbsnp149.rslookup.end_sort.txt as the
dnsnp_file, use the 'end' file if your data is base-1 coded.
This file can also be imported and the functions used directly, this provides
the added advantage of being able to pass Series and get back DataFrames
instead of having to work with the files.
Note: This script will only work on linux systems, it makes use of the linux
tools sort, awk, cat, zcat, and join; these tools work differently on different
systems. Be sure to check all outputs for sanity.
Also, dbSNP is huge, and these commands are run in serial, so they will take
multiple minutes or more to complete, particularly when creating the lookup
files.
"""
import os as _os
import sys as _sys
import bz2 as _bz2
import gzip as _gzip
import argparse as _argparse
from random import randint as _rand
from subprocess import check_call as _call
from subprocess import STDOUT as _STDOUT
from subprocess import CalledProcessError as _call_err
import pandas as pd
__all__ = ['make_lookup_tables', 'join_rsid', 'join_location']
###############################################################################
# dbSNP Manipulation Functions #
###############################################################################
def join_rsid(rsids, dbsnp_file, outfile, sort=True, as_df=False):
"""Use linux join to create a lookup table of rsids.
Args:
rsids (str/list): List of rsids as a file name (string), list of rsid,
or Series.
dbsnp_file (str): The dbsnp lookup file from make_lookup_tables.
should be the .rslookup.rs_sort.txt file (zipped ok)
outfile (str): Name of outfile to write to
sort (bool): Pre sort the rsids
as_df (bool): Return a dataframe
Writes:
A tab separated table of rsid, chrom, start, end for all rsids.
Returns:
DataFrame: Dataframe of written table, only returned if as_df is true.
"""
if isinstance(rsids, pd.core.series.Series):
rsids = rsids.tolist()
if isinstance(rsids, (list, tuple, set)):
rsids = sorted(list(set(rsids)))
tmpfile = outfile + '.rsids.tmp'
with open(tmpfile, 'w') as fout:
fout.write('\n'.join(rsids))
rsids = tmpfile
else:
tmpfile = None
rsids = _os.path.abspath(rsids)
outfile = _os.path.abspath(outfile)
if sort:
print('Sorting')
cat = 'zcat' if rsids.endswith('gz') else 'cat'
tmpfile = 'tmpsort_{}'.format(_rand(1000,20000))
script = r"""{cat} {rsids} | sort > {tmp}; mv {tmp} {rsids}"""
_call(script.format(cat=cat, rsids=rsids, tmp=tmpfile), shell=True)
print('Joining')
script = r"""join {rsids} {dbsnp} > {outfile}"""
try:
_call(
script.format(rsids=rsids, dbsnp=dbsnp_file, outfile=outfile),
stderr=_STDOUT, shell=True, universal_newlines=True
)
except _call_err as exc:
print("Status : FAIL", exc.returncode, exc.output)
raise exc
print('Done, file {} has the joined list'.format(outfile))
if as_df:
print('Getting DataFrame')
try:
df = pd.read_csv(outfile, sep=' ', header=None, index_col=0)
except pd.io.common.EmptyDataError:
print('Joined file empty, skipping')
return None
df.index.name = None
df.columns = ['chrom', 'start', 'end']
return df
def join_location(locs, dbsnp_file, outfile, sort=True, as_df=False):
"""Use linux join to create a lookup table of rsids.
Args:
locs (str/list): File name of a file of chr.position, of a list of the
same. Can also give a Series of the same, or a
DataFrame with the columns 'chrom' and 'position'.
dbsnp_file (str): The dbsnp lookup file from make_lookup_tables.
should be the .rslookup.rs_sort.txt file (zipped ok)
outfile (str): Name of outfile to write to
sort (bool): Pre sort the rsids
as_df (bool): Return a dataframe
Writes:
A tab separated table of chrom.pos, rsid for all locations.
Returns:
DataFrame: Dataframe of written table, only returned if as_df is true.
"""
if isinstance(locs, pd.core.series.Series):
locs = sorted(list(set(locs.tolist())))
if isinstance(locs, pd.core.frame.DataFrame):
locs = locs.chrom.astype(str) + '.' + locs.position.astype(str)
locs = sorted(list(set(locs.tolist())))
if isinstance(locs, (list, tuple, set)):
tmpfile = outfile + 'locs.tmp'
with open(tmpfile, 'w') as fout:
fout.write('\n'.join(locs))
locs = tmpfile
else:
tmpfile = None
locs = _os.path.abspath(locs)
outfile = _os.path.abspath(outfile)
if sort:
print('Sorting')
cat = 'zcat' if locs.endswith('gz') else 'cat'
script = r"""{cat} {locs} | sort -k1,1 > tmp45876; mv tmp45876 {locs}"""
_call(script.format(cat=cat, locs=locs), shell=True)
print('Joining')
script = r"""join {locs} {dbsnp} > {outfile}"""
_call(script.format(locs=locs, dbsnp=dbsnp_file, outfile=outfile),
shell=True)
if tmpfile:
_os.remove(tmpfile)
if as_df:
print('Getting DataFrame')
try:
lookup = pd.read_csv(outfile, sep=' ', header=None, index_col=0)
except pd.io.common.EmptyDataError:
print('Joined file empty, skipping')
return None
lookup.index.name = None
lookup['chrom'], lookup['position'] = lookup.index.to_series().str.split('.', 1).str
lookup.columns = ['rsid', 'chrom', 'position']
lookup = lookup[['chrom', 'position', 'rsid']]
return lookup
def make_lookup_tables(dbsnp_file, outname='dbsnp_filtered',
skip_filter=False):
"""Create a series of easy lookup tables from dbSNP files."""
filtered_bed = '{}.snp_only.bed'.format(outname)
if not skip_filter:
print('Making filtered snp_only bed file')
with open_zipped(dbsnp_file) as fin, open(filtered_bed, 'w') as fout:
for line in fin:
if line.startswith('track'):
continue
chrom, start, end, name, _, strand = line.rstrip().split('\t')
if int(end)-int(start) > 1:
continue
fout.write('\t'.join([chrom, start, end, name, strand]) + '\n')
# Create a lookup file sorted by rsid
print('Making rsid lookup table')
script = r"""cat {name} | awk '{{print $4 "\t" $1 "\t" $2 "\t" $3}}' | sort -k1,1 > {name}.rslookup.rs_sort.txt"""
_call(script.format(name=filtered_bed), shell=True)
# Create a location lookup file sorted by the start location (base-1)
print('Making start location lookup table')
script = r"""cat {name} | awk '{{print $1"."$2 "\t" $4}}' | sort -k1,1 > {name}.rslookup.start_sort.txt"""
_call(script.format(name=filtered_bed), shell=True)
# Create a location lookup file sorted by the end location (base-1)
print('Making end location lookup table')
script = r"""cat {name} | awk '{{print $1"."$3 "\t" $4}}' | sort -k1,1 > {name}.rslookup.end_sort.txt"""
_call(script.format(name=filtered_bed), shell=True)
print('Done')
###############################################################################
# Helper Functions #
###############################################################################
def open_zipped(infile, mode='r'):
"""Return file handle of file regardless of zipped or not.
Text mode enforced for compatibility with python2.
"""
mode = mode[0] + 't'
p2mode = mode
if hasattr(infile, 'write'):
return infile
if isinstance(infile, str):
if infile.endswith('.gz'):
return _gzip.open(infile, mode)
if infile.endswith('.bz2'):
if hasattr(_bz2, 'open'):
return _bz2.open(infile, mode)
else:
return _bz2.BZ2File(infile, p2mode)
return open(infile, p2mode)
def make_arg_parser():
"""Parse command line arguments."""
parser = _argparse.ArgumentParser(
description=__doc__,
formatter_class=_argparse.RawDescriptionHelpFormatter)
subparsers = parser.add_subparsers(title='mode', dest='mode')
# Create lookup tables
create = subparsers.add_parser('create', aliases=['c'],
help="Create the lookup tables")
create.add_argument('dbsnp_file', help="dbSNP file, raw download.")
create.add_argument('-o', '--outname',
help="A name prefix for the output files.")
# rsid lookup
rsids = subparsers.add_parser('rsids', aliases=['rsid', 'r'],
help="Get locations by rsid")
rsids.add_argument('rsid_file', help='File of rsids to join')
rsids.add_argument('dbsnp_file', help="dbSNP rsid lookup file.")
rsids.add_argument('outfile', help="File to write output to.")
rsids.add_argument('--no-sort', action='store_false',
help="rsid_file is already sorted")
# location lookup
locs = subparsers.add_parser('location', aliases=['locs', 'loc', 'l'],
help="Get rsids by location")
locs.add_argument('loc_file', help="File of chr.location")
locs.add_argument('dbsnp_file',
help="dbSNP location lookup file be sure to chose " +
"the end location file if your data is base-1")
locs.add_argument('outfile', help="File to write output to.")
locs.add_argument('--no-sort', action='store_false',
help="loc_file is already sorted")
return parser
###############################################################################
# Running Directly #
###############################################################################
def main(argv=None):
"""Run as a script."""
# Command line args
parser = make_arg_parser()
if not argv:
argv = _sys.argv[1:]
args = parser.parse_args(argv)
if args.mode == 'create':
make_lookup_tables(args.dbsnp_file, args.outname)
if args.mode == 'rsids':
join_rsid(args.rsid_file, args.dbsnp_file, args.outfile, args.no_sort)
if args.mode == 'location':
join_location(args.loc_file, args.dbsnp_file, args.outfile, args.no_sort)
if __name__ == '__main__' and '__file__' in globals():
_sys.exit(main())