-
Notifications
You must be signed in to change notification settings - Fork 0
/
hack_23andMe.py
executable file
·186 lines (155 loc) · 6.77 KB
/
hack_23andMe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/python
"""
For info, call Paul
zero seven six four zero seven five seven nine six
or email
paulolivier at gmail
Assumes the following directory structure:
convert.py
a folder with 23andme files
Creates the following files:
a count file, counting SNPs
a PED file
a MAP file
"""
# Usage examples:
# 0) help:
# ./hack_23andMe.py -h
# 1) generate SNP counts:
# ./hack_23andMe.py SNP my_23_dir/*
# 2) generates PED file, setting phenotype to 1 for everyone in this file (2 is default), saves to final.{ped,map}:
# ./hack_23andMe.py --phenotype 1 PED my_23_dir/*
# (one can do twice 2), with different phenotype, and concatenate)
# Run plink:
# ./plink --file final
import os, sys, glob, argparse
def is_valid(filename):
"""Return the opensnp files that are large and have 23andme in their name
"""
fifteen_mb = 15 * 1000 * 1000
if os.path.getsize(filename) > fifteen_mb and "23andme" in filename:
return True
else:
print "Warning ", filename, " not large enough"
return False
def genome(filename):
"""
Parses the 23andme files and returns a list of (rsid, genotype)
"""
def process_line(line):
line = line.strip("\n").strip("\r")
try:
if "\t" in line:
rsid, chromosome, position, genotype = tuple(line.split("\t"))
else:
rsid, chromosome, position, genotype = tuple(line.split(" "))
except:
raise Exception(" problematic line: "+ line)
return rsid, genotype
with open(filename, "r") as f:
try:
genome = [process_line(line) for line in f.xreadlines() if line[0] != "#"]
except Exception as e:
raise Exception("File "+filename+" has " + str(e))
return genome
def genomes_iterator(filenames):
"""
Iterates over the files in filenames, and yields associated genomes
"""
for filename in filenames:
if is_valid(filename):
try:
yield filename, genome(filename)
print "Done ", filename
except Exception as e:
print "Skipping " + filename + str(e)
def snps(filenames, output ):
"""
Selects the SNPs that are present in cutoff percent of the genomes
"""
from collections import Counter
snp_count = Counter()
genomes_count = 0
for filename, genome in genomes_iterator(filenames):
for snp, value in genome:
snp_count[snp] += 1
genomes_count += 1
#counts.sort(key = lambda (x,y): y, reverse = True)
counts = snp_count.values()
print "Parsed ", genomes_count, " genomes"
print "For indication: "
counts_counter = Counter()
for count in counts:
counts_counter[count] += 1
for repetition, snp_counts in sorted(list(counts_counter.items()), reverse = True):
print " ", snp_counts, " times a snp appearing ", repetition, " times"
print "Writing ", output
with open(output, "w") as f:
for x,y in sorted(list(snp_count.items()), reverse = True, key = lambda (x,y) : y):
f.write(str(y).rjust(5, "0")+" "+ str(x)+ "\n")
def good_snps_reader(snp_count_file, cutoff_genome, output):
"""
Reads in good SNPs and creates MAP file
"""
cutoff = cutoff_genome/100.
with open(snp_count_file, "r") as f:
counts = map(lambda (count, snp) : (snp, int(count)), \
[line.strip("\n").strip("\r").split(" ") for line in f.xreadlines()])
max_count = max([y for (x,y) in counts])
good_snps = sorted([x for (x,y) in counts if y > max_count*cutoff])
print "Selected at ", cutoff_genome, " percent ", len(good_snps), " snps out of ", len(counts)
map_output = output.split(".")[0]+".map"
with open(map_output, "w") as f:
for snp in good_snps:
f.write("0 "+snp+ " 0 1000\n")
return good_snps
def convert_allele(allele):
"""
AA ---> A A
-A ---> 0 A
-- ---> 0 0
"""
tmp = allele.replace("-", "0")
if len(tmp) == 1:
tmp = 2 * tmp
tmp = " ".join(list(tmp))
return tmp
def ped(snp_count_file, cutoff_genome, phenotype_value, genome_filenames, output):
"""
Selects genomes that have all the SNPs, creates standard PED file
"""
phenotype_value = str(phenotype_value)
assert phenotype_value in ["2", "1"] # Expects 2, 1 for affected/unaffected
good_snps = good_snps_reader(snp_count_file, cutoff_genome, output)
selected, rejected = 0, 0
with open(output, "w") as f:
for filename, genome in genomes_iterator(genome_filenames):
genome_dict = dict(genome)
id = filename.split(os.path.sep)[-1].strip("23andme.txt")
try:
print "Selected ", filename
v = [convert_allele(genome_dict[good_snp]) for good_snp in good_snps]
selected += 1
f.write(" ".join([id, id, "0", "0", "0", phenotype_value])+ " " + " ".join(v) + "\n")
# family ID, individual ID, Paternal ID, Maternal ID, sex, phenotype
except:
print "Rejected", filename
# This guy is missing a good snp
rejected += 1
print "Rejected: ", rejected
print "Selected: ", selected
# I decided to create complex PED file so command line is simple
# THESE WOULD BE MORE COMPLEX OPTIONS TO USE: --compound-genotypes --no-parents --no-sex --no-fid --missing-genotype -
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Manipules 23andMe files.')
parser.add_argument('action', help = "Main action to perform. SNP creates an SNP count file, PED creates PED and MAP files", choices= ["SNP", "PED"])
parser.add_argument('--cutoff', type = int, nargs = 1, default = 90, help = 'Only consider SNPs that are present in this percent of genomes, only necessary for PED action')
parser.add_argument('--phenotype', default = 2, help = "the phenotype value to write to the final output; 2 is disease present, 1 is absent", choices= ["1", "2"])
parser.add_argument('--snpoutput', nargs = 1, default = 'SNPfile.txt', help = "The file to use as for output for SNP count procedure")
parser.add_argument('genomes', nargs = "*", help = "The 23andMe files to consider")
parser.add_argument('--pedoutput', default = 'output.ped', help = "The base filename to use as for output for PED procedure, for .ped and .map files")
args = parser.parse_args()
if args.action == "SNP":
snps(args.genomes, args.snpoutput)
if args.action == "PED":
ped(args.snpoutput, args.cutoff, args.phenotype, args.genomes, args.pedoutput)