Skip to content

Commit

Permalink
added degree and ratio sampling method
Browse files Browse the repository at this point in the history
  • Loading branch information
ctrlaltaf committed Aug 29, 2024
1 parent baf918c commit d617b81
Show file tree
Hide file tree
Showing 13 changed files with 481 additions and 174 deletions.
50 changes: 29 additions & 21 deletions classes/overlapping_neighbors_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ def __init__(self):

# Accessor: returns the value currently stored in self.y_score.
def get_y_score(self):
return self.y_score

# Accessor: returns the value currently stored in self.y_true.
def get_y_true(self):
return self.y_true

# Mutator: replaces self.y_score with the supplied y_score value.
def set_y_score(self, y_score):
self.y_score = y_score

Expand Down Expand Up @@ -55,35 +55,54 @@ def predict(
"true_label": [],
}

positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name)
positive_dataset, negative_dataset = get_datasets(
input_directory_path, rep_num, name
)
G = import_graph_from_pickle(graph_file_path)
i = 1
for positive_protein, positive_go, negative_protein, negative_go in zip(
for positive_protein, positive_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):
# calculate the score for the positive set
positive_protein_neighbor = get_neighbors(
G, positive_protein, ["protein_protein", "regulatory"]
)

# print("\nPositive protein neighbors: " + str(positive_pro_pro_neighbor))
positive_go_neighbor = get_neighbors(G, positive_go, ["protein_go_term"])
positive_go_annotated_protein_neighbor_count = (
get_go_annotated_protein_neighbor_count(
G, positive_protein_neighbor, positive_go
)
)

if len(positive_protein_neighbor) == 0:
positive_score = 0
else:
positive_score = (1 + positive_go_annotated_protein_neighbor_count) / (
len(positive_protein_neighbor) + len(positive_go_neighbor)
)

# record the positive sample's features, score, and label (1) in data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

print_progress(i, len(positive_dataset["protein"]))
i += 1

for negative_protein, negative_go in zip(
negative_dataset["protein"],
negative_dataset["go"],
):

# calculate the score for the negative set
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
Expand All @@ -102,17 +121,6 @@ def predict(
len(negative_protein_neighbor) + len(negative_go_neighbor)
)

# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["protein_neighbor"].append(len(negative_protein_neighbor))
Expand All @@ -123,7 +131,7 @@ def predict(
data["score"].append(negative_score)
data["true_label"].append(0)

print_progress(i, len(positive_dataset["protein"]))
print_progress(i, len(negative_dataset["protein"]))
i += 1

normalized_data = normalize(data["score"])
Expand Down Expand Up @@ -151,7 +159,7 @@ def get_neighbors(G: nx.DiGraph, node, edgeTypes):
for edge in res:
if edge[2]["type"] in edgeTypes:
neighborNode = [edge[1], edge[2]]
neighbors.append(neighborNode)
neighbors.append(neighborNode)

return neighbors

Expand Down
40 changes: 23 additions & 17 deletions classes/overlapping_neighbors_v2_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,15 @@ def predict(
"true_label": [],
}

positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name)
positive_dataset, negative_dataset = get_datasets(
input_directory_path, rep_num, name
)

G = import_graph_from_pickle(graph_file_path)
i = 1
for positive_protein, positive_go, negative_protein, negative_go in zip(
for positive_protein, positive_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):
# calculate the score for the positive set
positive_protein_neighbor = get_neighbors(
Expand All @@ -82,6 +82,24 @@ def predict(
* positive_go_annotated_protein_neighbor_count
) / (len(positive_go_neighbor) / 2)

# record the positive sample's features, score, and label (1) in data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

print_progress(i, len(positive_dataset["protein"]))
i += 1

for negative_protein, negative_go in zip(
negative_dataset["protein"],
negative_dataset["go"],
):
# calculate the score for the negative set
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
Expand All @@ -93,7 +111,6 @@ def predict(
)
)


if len(negative_go_neighbor) == 0:
negative_score = 0
else:
Expand All @@ -103,17 +120,6 @@ def predict(
* negative_go_annotated_protein_neighbor_count
) / (len(negative_go_neighbor) / 2)

# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["protein_neighbor"].append(len(negative_protein_neighbor))
Expand All @@ -124,7 +130,7 @@ def predict(
data["score"].append(negative_score)
data["true_label"].append(0)

print_progress(i, len(positive_dataset["protein"]))
print_progress(i, len(negative_dataset["protein"]))
i += 1

normalized_data = normalize(data["score"])
Expand Down
35 changes: 20 additions & 15 deletions classes/overlapping_neighbors_v3_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,9 @@ def predict(
positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name)
G = import_graph_from_pickle(graph_file_path)

for positive_protein, positive_go, negative_protein, negative_go in zip(
for positive_protein, positive_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):
# calculate the score for the positive set
positive_protein_neighbor = get_neighbors(
Expand All @@ -79,6 +77,24 @@ def predict(
1 + positive_go_annotated_protein_neighbor_count
) / (len(positive_go_neighbor))

# record the positive sample's features, score, and label (1) in data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

print_progress(i, len(positive_dataset["protein"]))
i += 1

for negative_protein, negative_go in zip(
negative_dataset["protein"],
negative_dataset["go"],
):
# calculate the score for the negative set
negative_protein_neighbor = get_neighbors(
G, negative_protein, "protein_protein"
Expand All @@ -96,17 +112,6 @@ def predict(
1 + negative_go_annotated_protein_neighbor_count
) / (len(negative_go_neighbor))

# input positive and negative score to data
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["protein_neighbor"].append(len(positive_protein_neighbor))
data["go_neighbor"].append(len(positive_go_neighbor))
data["go_annotated_protein_neighbors"].append(
positive_go_annotated_protein_neighbor_count
)
data["score"].append(positive_score)
data["true_label"].append(1)

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["protein_neighbor"].append(len(negative_protein_neighbor))
Expand All @@ -117,7 +122,7 @@ def predict(
data["score"].append(negative_score)
data["true_label"].append(0)

print_progress(i, len(positive_dataset["protein"]))
print_progress(i, len(negative_dataset["protein"]))
i += 1

normalized_data = normalize(data["score"])
Expand Down
13 changes: 9 additions & 4 deletions classes/protein_degree_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,22 +47,27 @@ def predict(
G = import_graph_from_pickle(graph_file_path)

i = 1
for positive_protein, positive_go, negative_protein, negative_go in zip(
for positive_protein, positive_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):
data["protein"].append(positive_protein)
data["go_term"].append(positive_go)
data["degree"].append(G.degree(positive_protein))
data["true_label"].append(1)

print_progress(i, len(positive_dataset["protein"]))
i += 1

for negative_protein, negative_go in zip(
negative_dataset["protein"],
negative_dataset["go"],
):
data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["degree"].append(G.degree(negative_protein))
data["true_label"].append(0)
print_progress(i, len(positive_dataset["protein"]))
print_progress(i, len(negative_dataset["protein"]))
i += 1

normalized_data = normalize(data["degree"])
Expand Down
20 changes: 14 additions & 6 deletions classes/protein_degree_v2_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ def predict(
"true_label": [],
}

positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name)
positive_dataset, negative_dataset = get_datasets(
input_directory_path, rep_num, name
)
G = import_graph_from_pickle(graph_file_path)
i = 1
for positive_protein, positive_go, negative_protein, negative_go in zip(
for positive_protein, positive_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):

data["protein"].append(positive_protein)
Expand All @@ -63,13 +63,20 @@ def predict(
)
data["true_label"].append(1)

print_progress(i, len(positive_dataset["protein"]))
i += 1

for negative_protein, negative_go in zip(
negative_dataset["protein"],
negative_dataset["go"],
):
data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["degree"].append(
len(get_neighbors(G, negative_protein, "protein_protein"))
len(get_neighbors(G, negative_protein, "protein_protein"))
)
data["true_label"].append(0)
print_progress(i, len(positive_dataset["protein"]))
print_progress(i, len(negative_dataset["protein"]))
i += 1

normalized_data = normalize(data["degree"])
Expand All @@ -90,6 +97,7 @@ def predict(

return y_score, y_true


def normalize(data):
data = np.array(data)
min_val = data.min()
Expand Down
18 changes: 13 additions & 5 deletions classes/protein_degree_v3_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,14 @@ def predict(
"true_label": [],
}

positive_dataset, negative_dataset = get_datasets(input_directory_path, rep_num, name)
positive_dataset, negative_dataset = get_datasets(
input_directory_path, rep_num, name
)
G = import_graph_from_pickle(graph_file_path)
i = 1
for positive_protein, positive_go, negative_protein, negative_go in zip(
for positive_protein, positive_go in zip(
positive_dataset["protein"],
positive_dataset["go"],
negative_dataset["protein"],
negative_dataset["go"],
):

data["protein"].append(positive_protein)
Expand All @@ -64,13 +64,21 @@ def predict(
)
data["true_label"].append(1)

print_progress(i, len(positive_dataset["protein"]))
i += 1

for negative_protein, negative_go in zip(
negative_dataset["protein"],
negative_dataset["go"],
):

data["protein"].append(negative_protein)
data["go_term"].append(negative_go)
data["degree"].append(
len(get_neighbors(G, negative_protein, "protein_go_term"))
)
data["true_label"].append(0)
print_progress(i, len(positive_dataset["protein"]))
print_progress(i, len(negative_dataset["protein"]))
i += 1

normalized_data = normalize(data["degree"])
Expand Down
Loading

0 comments on commit d617b81

Please sign in to comment.