-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpathway_scores.py
117 lines (84 loc) · 3.32 KB
/
pathway_scores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pandas as pd
from get_symbols import *
import random
def binary_search(arr, target):
    """Find ``target`` in ``arr`` by repeated halving.

    ``arr`` must be ordered ascending under ``<`` for the search to be
    meaningful; it only needs to support ``len()`` and integer indexing.

    Returns the index of an element equal to ``target``, or -1 when no
    such element is reached by the search.
    """
    lo = 0
    hi = len(arr) - 1
    while lo <= hi:
        # Midpoint of the closed interval [lo, hi].
        probe = lo + (hi - lo) // 2
        candidate = arr[probe]
        if candidate == target:
            return probe
        if candidate < target:
            # Everything at or before the probe is too small.
            lo = probe + 1
            continue
        # Otherwise the probe and everything after it is too large.
        hi = probe - 1
    # The interval collapsed without a match.
    return -1
def getExp(csv_path="/Users/wanbo/Desktop/selected_genes.csv", pathway_genes=None):
    """Compute normalised, rank-weighted pathway scores for every sample column.

    The table is loaded with ``pandas.read_csv``. Its first column is matched
    against the gene names of each pathway; every remaining column is treated
    as one sample (the original code calls these "patients").

    For each sample and pathway, the expression values of the pathway's genes
    that occur in the table are sorted in descending order and combined as
    ``sum(value_i * (n - i) / n)``, where ``n`` is the number of matched genes
    and ``i`` is the 0-based rank. Each sample's pathway scores are then
    divided by that sample's total so they sum to 1.

    Args:
        csv_path: Location of the expression table (anything
            ``pandas.read_csv`` accepts). Defaults to the path the script was
            originally written against.
        pathway_genes: Mapping of pathway name to an iterable of gene names.
            When ``None`` the mapping is obtained from ``getDict()``.

    Returns:
        ``dict`` mapping sample column label -> ``dict`` mapping pathway name
        -> normalised score. Pathways with no matched gene score 0. If a
        sample's scores total 0 they are returned unnormalised, because the
        shares are undefined in that case.
    """
    df = pd.read_csv(csv_path)
    if pathway_genes is None:
        pathway_genes = getDict()

    # Size of the smallest pathway, capped at 1000. Only reported below.
    smallest_pathway_size = min(
        [1000] + [len(genes) for genes in pathway_genes.values()]
    )

    # Map each gene name to the position of its first row. A dictionary does
    # not require the table to be sorted by gene name, unlike a binary search
    # over the first column, which silently misses genes on unsorted input.
    gene_rows = {}
    for position, gene in enumerate(df.iloc[:, 0]):
        gene_rows.setdefault(gene, position)

    # sample -> pathway -> matched expression values, largest first.
    sorted_expression = {}
    for sample, values in df.iloc[:, 1:].items():
        sorted_expression[sample] = {
            pathway: sorted(
                (values.iloc[gene_rows[gene]] for gene in genes if gene in gene_rows),
                reverse=True,
            )
            for pathway, genes in pathway_genes.items()
        }

    # Diagnostic output kept from the original implementation.
    print(sorted_expression)
    print(smallest_pathway_size)

    scores = {}
    for sample, pathways in sorted_expression.items():
        raw_scores = {}
        for pathway, ranked in pathways.items():
            n = len(ranked)
            # Higher-ranked (larger) values receive weights closer to 1.
            raw_scores[pathway] = sum(
                value * (n - rank) / n for rank, value in enumerate(ranked)
            )

        # Take the denominator once, before any score is overwritten;
        # recomputing it while normalising in place mixes already-normalised
        # and raw scores and skews every pathway after the first.
        total = sum(raw_scores.values())
        if total != 0:
            raw_scores = {
                pathway: score / total for pathway, score in raw_scores.items()
            }
        scores[sample] = raw_scores

    return scores
# Score every sample column, then save the result as a CSV matrix.
data = getExp()
# from_dict(orient='index') turns the outer keys (samples) into rows;
# transposing puts pathways on the rows and samples on the columns.
df = pd.DataFrame.from_dict(data, orient='index')
transposed_df = df.T
# Written relative to the current working directory.
transposed_df.to_csv(path_or_buf='MATRIXXX.csv')