-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathcall_data.py
130 lines (119 loc) · 7.18 KB
/
call_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
import argparse as ap
import numpy as np
import pandas as pd
import glob, os, sys
from skbio.stats import composition as cps
def read_params():
### we need to write somewgere that, if more tables
### for rhe same dataset are available it gets all of them
p = ap.ArgumentParser(\
description=\
"Utility for retrieving cMD 3 data given "
"a metadata table and a folder with profiles in it.", \
formatter_class=ap.RawTextHelpFormatter)
add = p.add_argument
add("metadata", type=str, help="Metadata table of your analysis. "
"The program guesses if it's row-wise or column-wise.")
add("data_directory", type=str, help="Dir-prefix for the tables")
add("-z", "--feat_id", type=str, help="Feature-identifier: if set, will use just this one. "
"Otherwise uses a series of defaults deduced by the name of the file. "
"Usefull mainly if you want genera (g__) as the default in relative_abundances "
"is species (s__)")
add("-re", "--release", type=str, default="2021-03-31") # 2021-03-31.BrooksB_2017.relative_abundance.tsv
add("-si", "--studyID", type=str, default="dataset_name")
add("-shr", "--shrink", action="store_true", help=\
"In the case of any taxonomic feature, it just uses that level (gives you shorter names of feaures)")
add("-sta", "--stratified", action="store_true", help=\
"In case if pathways, pathcoverages and genefamilies, it returns also contributors for each feature [default: False]")
add("-of", "--outfile", type=str, default="", help=\
"If not set, will be a dump of metadata + the data-types inferred from the profiles")
add("-a", "--apply_t", type=str, default="arcsin", choices=["CLR", "arcsin"], help=\
"You can specify one of two transformation you want (CLR or arcsin squareroot). "
"Note that if you already applied it before you musn't apply it twice (eve tough the program "
"outputs a warning).")
return p.parse_args()
def apply_transform_on_data(dataframe, T):
samples = dataframe.columns.tolist()
features = dataframe.index.tolist()
sums = np.sum(dataframe.values.astype(float), axis=0)
if np.isclose(np.sum(sums), 100*len(samples)):
sys.stdout.write("\n Relative abundances (summing to 100) detected. Proceeding...")
if T.lower() == "clr":
for sample in samples:
dataframe[sample] = cps.clr(cps.multiplicative_replacement(dataframe.loc[features, sample].values.astype(float)))
elif T.lower() == "arcsin":
for sample in samples:
dataframe[sample] = np.arcsin(np.sqrt(dataframe[sample].values.astype(float)/100.))
else:
raise NotImplementedError("Sorry: the transformation %s is not available yet." %T)
elif np.isclose(np.sum(sums), len(samples)):
sys.stdout.write("\n Relative abundances (summing to 1) detected. Proceeding...")
if T.lower() == "clr":
for sample in samples:
dataframe[sample] = cps.clr(cps.multiplicative_replacement(dataframe.loc[features, sample].values.astype(float)*100.))
elif T.lower() == "arcsin":
for sample in samples:
dataframe[sample] = np.arcsin(np.sqrt(dataframe[sample].values.astype(float)))
else:
raise NotImplementedError("Sorry: the transformation %s is not available yet." %T)
elif np.isclose(np.sum(sums), 0.0):
raise ValueError("\n Data summing to 0.0 with tol.: CLR already performed? Exiting...")
elif np.isclose( np.sum([((np.sin(dataframe.loc[:, sample].values.astype(float)))**2)*100 \
for sample in samples]), 100*len(samples)):
raise ValueError("\n Data summing to 100 after inverse of arcsin-sqrt. Exiting...")
return dataframe
def main(args):
data_types = {"relative_abundance": "taxon", "pathway_abundance": "pathway", "gene_families": "genefamily", "pathway_coverage": "coverage"}
default_feat_ids = {"taxon": "s__", "pathway": "PWY", "genefamily": "UniRef90", "KEGG": "K", "ECnumber": "EC"}
metadata = pd.read_csv(args.metadata, sep="\t", header=0, index_col=0, low_memory=False, engine="c").fillna("NA")
data = []
data_type_merged = []
featuress = set()
if "number_reads" in metadata.index.tolist():
metadata = metadata.T
for dat in metadata[args.studyID].unique():
metadata_this = metadata.loc[metadata[args.studyID]==dat]
study = dat
for datatable in glob.glob(os.path.join(args.data_directory, ".".join([args.release, study, "*", "tsv"]))):
data_type = data_types[ os.path.basename(datatable).split(".")[2] ]
feat_id = default_feat_ids[ data_type ]
if (not data_type in data_type_merged):
data_type_merged += [data_type]
datatab = pd.read_csv(datatable, sep="\t", header=0, index_col=0, low_memory=False, engine="c")
Features = [j for j in datatab.index.tolist() if (feat_id in j)]
if (data_type == "taxon") and (args.shrink):
feature_mapper = dict([ (ft,ft.split("|")[-1]) for ft in Features if ft.split("|")[-1].startswith(feat_id) ])
datatab.rename(index=feature_mapper, inplace=True)
new_Features = [feature_mapper[ft] for ft in feature_mapper]
datatab.drop([ft for ft in Features if (not ft in feature_mapper)], inplace=True, axis=0)
Features = new_Features
if (data_type in ["pathway", "genefamily", "coverage"]) and (not args.stratified):
new_Features = [ft for ft in Features if (not "|" in ft)]
datatab.drop([ft for ft in Features if (not ft in new_Features)], inplace=True, axis=0)
Features = new_Features
featuress = featuress | set(Features)
Samples = metadata_this.index.tolist()
datatab.columns = [(study + "_" + c) for c in datatab.columns.tolist()]
## Here I apply the transformation
datatab = apply_transform_on_data(datatab.loc[ Features, Samples ], args.apply_t)
### HERE WE MUST INSERT A COUPLE OF CHECKS
metadata_with_data = metadata_this.T.append( datatab )
if not len(data):
data = metadata_with_data
else:
data = data.merge(metadata_with_data, right_index=True, left_index=True, how="outer").fillna(0.0)
sys.stdout.write("\n Metadata+data addition of study %s, of shape: " %study)
sys.stdout.write(" ".join(map(str, list(metadata_with_data.shape))))
data = data.loc[[i for i in data.index.tolist() if (not i in featuress)] + list(featuress)]
data.index.name = "sampleID"
if (not args.outfile):
args.outfile = args.metadata.replace(".tsv", "_" + "_".join(data_type_merged) + ".tsv")
sys.stdout.write("\n Final table to save...")
sys.stdout.write("\n Number features = %i" %len(list(featuress)))
sys.stdout.write("\n Number samples = %i" %len(data.columns.tolist()))
sys.stdout.write("\n Tot.number metadata = %i" %(int(data.shape[0]) - len(list(featuress))))
sys.stdout.write("\n ... saving in %s.\n" %args.outfile)
data.to_csv(args.outfile, sep="\t", header=True, index=True)
if __name__ == "__main__":
main(read_params())