-
Notifications
You must be signed in to change notification settings - Fork 0
/
modifier.py
31 lines (23 loc) · 1.04 KB
/
modifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
# Names of the batches whose preprocessed genotype tables are loaded below.
list_batch = ['batch1', 'batch2', 'batch3', 'batch4']

# Map each batch name to its genotype table, transposed (.T) right after
# loading; the merge step later in this script joins on the index of these
# transposed frames.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# only ever come from a trusted run of the preprocessing step.
dictionary = {
    batch: pd.read_pickle(f"./output/results/preprocessing/{batch}_geno.p").T
    for batch in list_batch
}
# Load the HGNC gene table (tab-separated file in the working directory).
hgcn = pd.read_csv("hgnc.txt", sep="\t")

# Build the symbol <-> Entrez-ID lookup table used by the merge step.
# Every call in the chain returns a new frame, so nothing is modified in
# place on a column subset of `hgcn` (the original inplace rename on such a
# subset is the pattern that raises SettingWithCopyWarning).
hg = (
    hgcn[["Approved Symbol", "Entrez Gene ID(supplied by NCBI)"]]
    .rename(
        columns={
            "Approved Symbol": "Gene_Symbol",
            "Entrez Gene ID(supplied by NCBI)": "ENTREZ_ID",
        }
    )
    # Rows missing either the symbol or the Entrez ID cannot be mapped.
    .dropna()
    # Cast through int before str so the IDs are plain digit strings
    # (presumably the column is read as float because of the missing
    # values -- confirm against hgnc.txt).
    .assign(ENTREZ_ID=lambda frame: frame["ENTREZ_ID"].astype(int).astype(str))
)
# For every loaded batch: attach gene symbols to the genotype table, drop the
# Entrez-ID helper column, and export the result as both TSV and pickle.
for batch in dictionary:
    # Join rows of `hg` to rows of the batch table whose index label equals
    # the ENTREZ_ID string (DataFrame.merge defaults to an inner join, so
    # unmatched rows on either side are discarded).
    merged = hg.merge(dictionary[batch], left_on="ENTREZ_ID", right_index=True)
    print(merged.head())

    # Only the gene symbol is kept as identifier in the exported table.
    merged = merged.drop(columns="ENTREZ_ID")
    # Call head() -- printing the bare attribute dumps the bound method and
    # the whole frame instead of a preview.
    print(merged.head())

    # Output names are kept exactly as before (e.g. "batch1GS_geno.txt").
    merged.to_csv(
        f"./output/results/preprocessing/{batch}GS_geno.txt",
        sep="\t",
        index=False,
    )
    merged.to_pickle(f"./output/results/preprocessing/{batch}GS_geno.p")