Skip to content

Commit

Permalink
Fix output file naming and prepare for pytest
Browse files Browse the repository at this point in the history
  • Loading branch information
ctrlaltaf committed Jun 11, 2024
1 parent 42cf252 commit 43f18bf
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 33 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,6 @@ env/
*.log

output/
input/
input/

.ipynb_checkpoints
2 changes: 1 addition & 1 deletion classes/hypergeometric_distribution_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def predict(
df = df.sort_values(by="norm_score", ascending=False)

df.to_csv(
Path(output_path, "hypergeometricdistribution.csv"),
Path(output_path, "hypergeometric_distribution.csv"),
index=False,
sep="\t",
)
Expand Down
2 changes: 1 addition & 1 deletion classes/hypergeometric_distribution_class_V2.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def predict(
df = df.sort_values(by="norm_score", ascending=False)

df.to_csv(
Path(output_path, "hypergeometricdistributionV3.csv"),
Path(output_path, "hypergeometric_distribution_V2.csv"),
index=False,
sep="\t",
)
Expand Down
2 changes: 1 addition & 1 deletion classes/hypergeometric_distribution_class_V3.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def predict(
df = df.sort_values(by="norm_score", ascending=False)

df.to_csv(
Path(output_path, "hypergeometricdistribution.csv"),
Path(output_path, "hypergeometric_distribution_v3.csv"),
index=False,
sep="\t",
)
Expand Down
2 changes: 1 addition & 1 deletion classes/hypergeometric_distribution_class_V4.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def predict(
df = df.sort_values(by="norm_score", ascending=False)

df.to_csv(
Path(output_path, "hypergeometricdistribution.csv"),
Path(output_path, "hypergeometric_distribution_v4_data.csv"),
index=False,
sep="\t",
)
Expand Down
13 changes: 9 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ def main():
graph_file_path = Path(dataset_directory_path, "graph.pickle")
sample_size = 1000

testing_output_data_path = Path("./output/data/")
testing_output_image_path = Path("./output/images/")
testing_input_directory_path = Path("./tests/testing-dataset/")
testing_graph_file_path = Path(testing_input_directory_path, "graph.pickle")

interactome_columns = [0, 1, 4, 5]
interactome = read_specific_columns(interactome_path, interactome_columns, "\t")

Expand Down Expand Up @@ -81,10 +86,10 @@ def main():

results = run_workflow(
algorithm_classes,
dataset_directory_path,
graph_file_path,
output_data_path,
output_image_path,
testing_input_directory_path,
testing_graph_file_path,
testing_output_data_path,
testing_output_image_path,
True,
True,
)
Expand Down
44 changes: 25 additions & 19 deletions main_replicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,22 +75,24 @@ def main():
"HypergeometricDistributionV3": HypergeometricDistributionV3,
"HypergeometricDistributionV4": HypergeometricDistributionV4,
}
x = 20 #Number of replicates

x = 2 # Number of replicates
print_graphs = False
auc = {}
#index 0 is ROC, index 1 is Precision Recall
# index 0 is ROC, index 1 is Precision Recall
for i in algorithm_classes.keys():
auc[i] = [[],[]]
auc[i] = [[], []]

for i in range(x): #Creates a pos/neg list each replicate then runs workflow like normal
for i in range(
x
): # Creates a pos/neg list each replicate then runs workflow like normal
print("\n\nReplicate: " + str(i) + "\n")

# if there is no sample dataset, uncomment the following lines. otherwise, the dataset in outputs will be used
positive_dataset, negative_dataset = sample_data(
go_protein_pairs, sample_size, protein_list, G, dataset_directory_path
)

results = run_workflow(
algorithm_classes,
dataset_directory_path,
Expand All @@ -101,29 +103,33 @@ def main():
print_graphs,
)

#each loop adds the roc and pr values, index 0 for roc and 1 for pr, for each algorithm
# each loop adds the roc and pr values, index 0 for roc and 1 for pr, for each algorithm
for i in algorithm_classes.keys():
auc[i][0].append(results[i]['roc_auc'])
auc[i][1].append(results[i]['pr_auc'])
auc[i][0].append(results[i]["roc_auc"])
auc[i][1].append(results[i]["pr_auc"])

#Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
# Finds mean and sd of values, ROC mean index 0, ROC sd index 1, PR mean index 2, and PR sd index 3
for i in auc.keys():
meanROC = round(stat.mean(auc[i][0]),5)
auc[i].append(round(stat.mean(auc[i][1]),5))
auc[i].append(round(stat.stdev(auc[i][1]),5))
auc[i][1] = round(stat.stdev(auc[i][0]),5)
meanROC = round(stat.mean(auc[i][0]), 5)
auc[i].append(round(stat.mean(auc[i][1]), 5))
auc[i].append(round(stat.stdev(auc[i][1]), 5))
auc[i][1] = round(stat.stdev(auc[i][0]), 5)
auc[i][0] = meanROC

#Prints the roc and pr table, then saves to .csv file
df = pd.DataFrame.from_dict(auc, orient = 'index', columns = ['ROC mean', 'ROC sd', 'Precision/Recall mean', 'Precision/Recall sd'])
# Prints the roc and pr table, then saves to .csv file
df = pd.DataFrame.from_dict(
auc,
orient="index",
columns=["ROC mean", "ROC sd", "Precision/Recall mean", "Precision/Recall sd"],
)
print()
print(df)
df.to_csv(
Path(output_data_path, "auc_values.csv"),
Path(output_data_path, "repeated_auc_values.csv"),
index=True,
sep="\t",
)

sys.exit()


Expand Down
23 changes: 18 additions & 5 deletions tools/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def run_workflow(
if threshold:
run_thresholds(results, algorithm_classes, output_data_path)
if figures:
generate_figures(algorithm_classes, results, output_image_path)
generate_figures(algorithm_classes, results, output_image_path, output_data_path)

return results

Expand Down Expand Up @@ -135,12 +135,12 @@ def run_thresholds(results, algorithm_classes, output_data_path):
)


def generate_figures(algorithm_classes, results, output_image_path):
def generate_figures(algorithm_classes, results, output_image_path, output_data_path):
# Generate ROC and PR figures to compare methods

colors = generate_random_colors(len(algorithm_classes))

sorted_results = sort_results_by(results, "roc_auc")
sorted_results = sort_results_by(results, "roc_auc", output_data_path)
i = 0
plt.figure()
for algorithm_name, metrics in sorted_results.items():
Expand All @@ -162,7 +162,7 @@ def generate_figures(algorithm_classes, results, output_image_path):
plt.savefig(Path(output_image_path, "multiple_roc_curves.png"))
plt.show()

sorted_results = sort_results_by(results, "pr_auc")
sorted_results = sort_results_by(results, "pr_auc", output_data_path)
i = 0
plt.figure()
for algorithm_name, metrics in sorted_results.items():
Expand Down Expand Up @@ -245,12 +245,25 @@ def get_datasets(input_directory_path):
return positive_dataset, negative_dataset


def sort_results_by(results, key):
def sort_results_by(results, key, output_path):
algorithm_tuple_list = []
data = {"algorithm" : [], key: []}
output_file_path = Path(output_path, key + "_results.csv")

# make a list of tuples where a tuple is (algorithm_name, the metric we will be sorting by)
for algorithm_name, metrics in results.items():
algorithm_tuple_list.append((algorithm_name, metrics[key]))
data["algorithm"].append(algorithm_name)
data[key].append(metrics[key])

df = pd.DataFrame(data)
df = df.sort_values(by=key, ascending=False)

df.to_csv(
output_file_path,
index=False,
sep="\t",
)

algorithm_tuple_list = sorted(algorithm_tuple_list, key=itemgetter(1), reverse=True)

Expand Down

0 comments on commit 43f18bf

Please sign in to comment.