-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path2-makeTFactorDf.py
72 lines (63 loc) · 2.66 KB
/
2-makeTFactorDf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from multiprocessing import Pool
# Input and output locations, relative to the directory the script is run from.
filteredBedIntersectPath = "../results/bedIntersectWaWbTFBSinGenesFiltered.tsv"
humanGenesFPKMInTissues = "../input/table_Human_body_map_ze_FPKM.txt"
tfFPKMInTissuesPath = "../results/tfFPKMinTissues.tsv"

# Tissue columns read from the FPKM table and reported in the output table.
# NOTE(review): "thyriod" looks like a misspelling of "thyroid", but the name
# is used as a column label of the FPKM table -- confirm against the input
# file's header before renaming it.
tissueNames = ["adipose_tissue", "adrenal_gland", "brain", "breast", "colon",
               "heart", "kidney", "leukocyte", "liver", "lung", "lymph_node",
               "ovary", "prostate", "skeletal_muscle", "testis", "thyriod"]

# Fraction of each input table to keep (1.00 = every row). Lower it to try the
# script on a random subset of the data.
percent = 1.00


def _sampleRows(df, fraction):
    """Return int(len(df) * fraction) distinct rows of df, in random order.

    Rows are drawn WITHOUT replacement, so a fraction of 1.0 keeps every row
    exactly once. (Sampling with replacement, numpy.random.choice's default,
    would duplicate some rows and silently drop others.)
    """
    nRows = int(len(df) * fraction)
    positions = np.random.permutation(len(df))[:nRows]
    return df.iloc[positions]


print("<Reading input data>")
filteredBedIntersectDf = _sampleRows(
    pd.read_csv(filteredBedIntersectPath, sep="\t"), percent)
# Not referenced by the rest of the visible script; kept so the module still
# defines the same names as before.
geneTFRelations = dict()
tfsOfGene = dict()
geneFpkmDf = _sampleRows(
    pd.read_csv(humanGenesFPKMInTissues, sep="\t"), percent)
print("</Loaded input data>")
def getTFRow(tFactorName):
    """Build the output row for a single transcription factor.

    Every row of ``filteredBedIntersectDf`` whose ``tfName`` equals
    ``tFactorName`` contributes through its ``geneName`` and ``count``
    columns: the gene is looked up by name in ``geneFpkmDf`` and, when it is
    found, each tissue value of the first matching gene row is multiplied by
    ``count`` and added to that tissue's running total.

    Returns a dict with the keys ``tfName``, one key per entry of
    ``tissueNames`` (the count-weighted sums), ``genesWithBS`` (number of
    rows whose gene was found) and ``bindingSites`` (sum of ``count`` over
    those rows).
    """
    print("\tStarting for " + tFactorName)
    sitesOfFactor = filteredBedIntersectDf[
        filteredBedIntersectDf.tfName == tFactorName]

    summary = {"tfName": tFactorName}
    summary.update((tissue, 0.0) for tissue in tissueNames)
    matchedGenes = 0
    totalSites = 0

    for _, site in sitesOfFactor.iterrows():
        geneName = site['geneName']
        siteCount = site['count']
        sameNameGenes = geneFpkmDf[geneFpkmDf.geneName == geneName]
        # Several genes may share one name; only the first of them is used.
        firstMatch = next(sameNameGenes.iterrows(), None)
        if firstMatch is None:
            # The gene is absent from the FPKM table: nothing to accumulate.
            continue
        expression = firstMatch[1]
        matchedGenes += 1
        totalSites += siteCount
        for tissue in tissueNames:
            summary[tissue] += expression[tissue] * siteCount

    summary["genesWithBS"] = matchedGenes
    summary["bindingSites"] = totalSites
    print("\tDone for " + tFactorName)
    return summary
# Driver: compute one row per transcription factor in parallel, then write the
# resulting table. It is guarded because worker processes started with the
# "spawn" method re-import this module; without the guard each worker would
# try to create its own pool.
if __name__ == "__main__":
    print("<Creating rows>")
    pool = Pool(processes=5)
    try:
        rows = pool.map(getTFRow, filteredBedIntersectDf.tfName.unique())
    finally:
        # Release the worker processes even when a task raised.
        pool.close()
        pool.join()
    print("</Created rows>")

    print("<Writing dataframe>")
    # Column order of the output file: identifiers and totals first, then one
    # column per tissue.
    columnNames = ['tfName', 'genesWithBS', 'bindingSites'] + tissueNames
    df = pd.DataFrame(rows, columns=columnNames)
    print("Computing mean values column")
    # Both statistics are taken across the tissue columns of each row.
    df['mean'] = df[tissueNames].mean(axis=1)
    df['stDeviation'] = df[tissueNames].std(axis=1)
    df.to_csv(tfFPKMInTissuesPath, sep="\t", index=False)
    print("</Dataframe saved at " + tfFPKMInTissuesPath + ">")