forked from anthonyozerov/variant-filtering
-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
74 lines (59 loc) · 2.61 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import argparse
import pandas as pd
from family import *
from utils import *
if __name__ == '__main__':
    # Command-line driver: read the pedigree (and, unless --nophen, the
    # phenotype files), filter the variant table once per family, and write
    # the combined results to CSV.
    argp = argparse.ArgumentParser()
    argp.add_argument('-p', '--pedfile', default="Test_Ped.txt",
                      help="pedigree file used to build the families")
    argp.add_argument('-d', '--data', default="Test_cleaned.txt",
                      help="tab-separated file containing the variants")
    argp.add_argument('-o', '--output', default="filtered.csv",
                      help="CSV path for results without phenotype filter")
    argp.add_argument('-op', '--output_phen', default="filtered_phen.csv",
                      help="CSV path for results with phenotype filter")
    argp.add_argument('-f', '--family', default="",
                      help="family to additionally export to its own CSV")
    argp.add_argument('-ph', '--phenfile', default="Test_Phen.txt",
                      help="phenotype file passed to the phenotype loader")
    argp.add_argument('-m', '--mapfile', default="phenotype_to_genes.txt",
                      help="phenotype-to-genes map passed to the phenotype "
                           "loader")
    argp.add_argument('--nophen', default=False, action='store_true',
                      help="skip phenotype loading and the phenotype-filtered "
                           "output")
    args = argp.parse_args()

    # get a dict of families from the pedfile
    families = get_families(args.pedfile)

    # Reject an unknown --family before the expensive loading steps, with a
    # usage error instead of a raw KeyError traceback later on.
    if args.family and args.family not in families:
        argp.error("family {!r} not found in pedfile {}".format(
            args.family, args.pedfile))

    if not args.nophen:
        print("Getting relevant genes for family phenotypes...")
        # give each family a list of genes relevant to their phenotype
        load_phen(families, args.phenfile, args.mapfile)

    # read in the file containing variants
    df = pd.read_csv(args.data, sep='\t')

    # check that there are no errors, and remove rows with errors.
    df = verify(df)

    # csv with variants in one family
    if args.family:
        fam = families[args.family]
        fam_variants = df.copy()
        for person in fam.people:
            # "Unaffected" people go through filter_zyg, everyone else through
            # exclude_zyg, both against the "0/0" genotype.
            # NOTE(review): presumably this keeps variants that unaffected
            # members lack and affected members carry -- confirm in utils.
            filt = filter_zyg if person.phen == "Unaffected" else exclude_zyg
            fam_variants = filt(fam_variants, person.ID, "0/0")
        fam_variants.to_csv(fam.ID + ".csv")

    # Per-family results with and without the phenotype filter. They are
    # collected in lists and concatenated once at the end, which avoids
    # re-copying a growing frame on every iteration.
    frames = []
    frames_p = []
    for fam in families.values():
        print("Filtering", fam.ID + '...')
        # variants for the family, without phenotype filter
        frames.append(filter_family(df, fam, phenfilter=False))
        if not args.nophen:
            # variants for the family, with phenotype filter
            frames_p.append(filter_family(df, fam, phenfilter=True))

    # pd.concat raises on an empty list, so fall back to an empty frame.
    result = pd.concat(frames) if frames else pd.DataFrame()
    # organize result first by sample and then by inh model; skipped when
    # there are no rows, since an empty frame may not have these columns.
    if not result.empty:
        result = result.sort_values(['sample', 'inh model'])
    # save result
    result.to_csv(args.output)
    print(result)

    # save result with phenotype filter
    if not args.nophen:
        result_p = pd.concat(frames_p) if frames_p else pd.DataFrame()
        if not result_p.empty:
            result_p = result_p.sort_values(
                ['family', 'phens_matched', 'sample'])
        result_p.to_csv(args.output_phen)
        print(result_p)