From 2964db171a72eae88445202531dbd6b015dee0da Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 22 Dec 2023 12:22:20 -0800 Subject: [PATCH 01/26] almost done with adding header files --- docker-wrappers/Cytoscape/cytoscape_util.py | 4 +++- spras/allpairs.py | 4 +++- spras/analysis/graphspace.py | 8 ++++---- spras/analysis/ml.py | 7 +++++-- spras/analysis/summary.py | 13 +++++++++++-- spras/domino.py | 6 ++++-- spras/meo.py | 6 +++--- spras/mincostflow.py | 3 ++- spras/omicsintegrator1.py | 6 +++--- spras/omicsintegrator2.py | 3 ++- spras/pathlinker.py | 3 ++- ...egfr-omicsintegrator1-params-3THRXWW_pathway.txt | 1 + ...egfr-omicsintegrator1-params-5QH767V_pathway.txt | 1 + ...egfr-omicsintegrator1-params-ITO5EQS_pathway.txt | 1 + .../tps-egfr-pathlinker-params-7S4SLU6_pathway.txt | 1 + .../tps-egfr-pathlinker-params-TCEMRS7_pathway.txt | 1 + .../example/data0-meo-params-GKEDDFZ_pathway.txt | 1 + ...ata0-omicsintegrator1-params-RQCQ4YN_pathway.txt | 1 + ...ata0-omicsintegrator1-params-WY4V42C_pathway.txt | 1 + ...ata0-omicsintegrator2-params-IV3IPCJ_pathway.txt | 1 + .../data0-pathlinker-params-6SWY7JS_pathway.txt | 1 + .../data0-pathlinker-params-VQL7BDZ_pathway.txt | 1 + .../example/data1-meo-params-GKEDDFZ_pathway.txt | 1 + ...ata1-omicsintegrator1-params-JAZWLAK_pathway.txt | 1 + ...ata1-omicsintegrator1-params-PU62FNV_pathway.txt | 1 + ...ata1-omicsintegrator2-params-IV3IPCJ_pathway.txt | 1 + .../data1-pathlinker-params-6SWY7JS_pathway.txt | 1 + .../data1-pathlinker-params-VQL7BDZ_pathway.txt | 1 + test/analysis/input/standardized-ranked.txt | 1 + test/analysis/input/toy/network1.txt | 1 + test/analysis/input/toy/network2.txt | 1 + test/analysis/input/toy/network3.txt | 1 + test/analysis/input/toy/network4.txt | 1 + test/analysis/input/toy/network5.txt | 1 + test/analysis/output/example_summary.txt | 6 ------ test/ml/input/test-data-longName/longName.txt | 1 + test/ml/input/test-data-longName2/longName2.txt | 1 + test/ml/input/test-data-s1/s1.txt | 1 + test/ml/input/test-data-s2/s2.txt | 1 + test/ml/input/test-data-s3/s3.txt | 1 + test/ml/input/test-data-spaces/spaces.txt | 1 + .../input/test-mixed-direction/mixed-direction.txt | 1 + .../expected/allpairs-pathway-expected.txt | 1 + .../expected/domino-pathway-expected.txt | 1 + .../parse-outputs/expected/meo-pathway-expected.txt | 1 + .../expected/mincostflow-pathway-expected.txt | 1 + .../expected/omicsintegrator1-pathway-expected.txt | 1 + .../expected/omicsintegrator2-pathway-expected.txt | 1 + .../expected/pathlinker-pathway-expected.txt | 1 + 49 files changed, 79 insertions(+), 27 deletions(-) diff --git a/docker-wrappers/Cytoscape/cytoscape_util.py b/docker-wrappers/Cytoscape/cytoscape_util.py index dcf110f1..c8f47922 100644 --- a/docker-wrappers/Cytoscape/cytoscape_util.py +++ b/docker-wrappers/Cytoscape/cytoscape_util.py @@ -116,7 +116,9 @@ def load_pathways(pathways: List[str], output: str) -> None: suid = p4c.networks.import_network_from_tabular_file( file=path, column_type_list='s,t,x,ea', - delimiters='\t' + delimiters='\t', + first_row_as_column_names = True, + start_load_row = 2, ) p4c.networks.rename_network(name, network=suid) diff --git a/spras/allpairs.py b/spras/allpairs.py index 3de9029d..38f7f000 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -110,6 +110,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param standardized_pathway_file: the same pathway written in the universal format """ df = pd.read_csv(raw_pathway_file, sep='\t', header=None) + df['Rank'] = 1 # add a rank column of 1s since the edges are not ranked. df = reinsert_direction_col_undirected(df) - df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t') + df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] + df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/analysis/graphspace.py b/spras/analysis/graphspace.py index ba87de6e..bee103f6 100644 --- a/spras/analysis/graphspace.py +++ b/spras/analysis/graphspace.py @@ -77,21 +77,21 @@ def load_graph(path: str) -> Tuple[Union[nx.Graph, nx.DiGraph], bool]: directed = False try: - pathways = pd.read_csv(path, sep="\t", header=None) + pathways = pd.read_csv(path, sep="\t", header=0) except pd.errors.EmptyDataError: print(f"The file {path} is empty.") return G, directed - pathways.columns = ["Interactor1", "Interactor2", "Rank", "Direction"] + mask_u = pathways['Direction'] == 'U' mask_d = pathways['Direction'] == 'D' pathways.drop(columns=["Direction"]) if mask_u.all(): - G = nx.from_pandas_edgelist(pathways, "Interactor1", "Interactor2", ["Rank"]) + G = nx.from_pandas_edgelist(pathways, "Node1", "Node2", ["Rank"]) directed = False elif mask_d.all(): - G = nx.from_pandas_edgelist(pathways, "Interactor1", "Interactor2", ["Rank"], create_using=nx.DiGraph()) + G = nx.from_pandas_edgelist(pathways, "Node1", "Node2", ["Rank"], create_using=nx.DiGraph()) directed = True else: print(f"{path} could not be visualized. GraphSpace does not support mixed direction type graphs currently") diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 64260ad6..2c129873 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -44,7 +44,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra edges = [] for line in lines: parts = line.split('\t') - if len(parts) > 0: # in case of empty line in file + if len(parts) >= 4: # in case of empty line in file or line doesn't include all values node1 = parts[0] node2 = parts[1] direction = str(parts[3]).strip() @@ -55,7 +55,10 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra # node order does matter for directed edges edges.append(DIR_CONST.join([node1, node2])) else: - ValueError(f"direction is {direction}, rather than U or D") + if direction == 'Direction': # if reading the header + continue + else: + raise ValueError(f"direction is {direction}, rather than U or D") # getting the algorithm name p = PurePath(file) diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index 9b0d797d..d0bc9582 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -33,8 +33,17 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) -> # Iterate through each network file path for file_path in sorted(file_paths): - # Load in the network - nw = nx.read_edgelist(file_path, data=(('weight', float), ('Direction',str))) + nw = None + # nw = nx.read_edgelist(file_path, data=(('weight', float), ('Direction',str))) + if os.path.getsize(file_path) == 0: + continue + else: + with open(file_path, 'r') as f: + # skip the header line + next(f) + # Load in the network + nw = nx.read_edgelist(f, data=(('weight', float), ('Direction', str))) + # Save the network name, number of nodes, number edges, and number of connected components nw_name = str(file_path) number_nodes = nw.number_of_nodes() diff --git a/spras/domino.py b/spras/domino.py index 53434e3f..4666bf83 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -205,8 +205,10 @@ def parse_output(raw_pathway_file, standardized_pathway_file): edges_df['source'] = edges_df['source'].apply(post_domino_id_transform) edges_df['target'] = edges_df['target'].apply(post_domino_id_transform) edges_df = reinsert_direction_col_undirected(edges_df) - - edges_df.to_csv(standardized_pathway_file, sep='\t', header=False, index=False) + edges_df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] + edges_df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False) + else: + edges_df.to_csv(standardized_pathway_file, sep='\t', header=None, index=False) def pre_domino_id_transform(node_id): diff --git a/spras/meo.py b/spras/meo.py index d9cf2f24..2d5d8741 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -188,6 +188,6 @@ def parse_output(raw_pathway_file, standardized_pathway_file): # Would need to load the paths output file to rank edges correctly df = add_rank_column(df) df = reinsert_direction_col_directed(df) - - df.to_csv(standardized_pathway_file, columns=['Source', 'Target', 'Rank', "Direction"], header=False, - index=False, sep='\t') + df.drop(columns=['Type', 'Oriented', 'Weight'], inplace = True) + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 24ee9677..a1e18942 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -154,5 +154,6 @@ def parse_output(raw_pathway_file, standardized_pathway_file): # TODO update MinCostFlow version to support mixed graphs # Currently directed edges in the input will be converted to undirected edges in the output df = reinsert_direction_col_undirected(df) - df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t') + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 6e2c6807..bf619093 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -201,6 +201,6 @@ def parse_output(raw_pathway_file, standardized_pathway_file): df.columns = ["Edge1", "InteractionType", "Edge2"] df = add_rank_column(df) df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp") - - df.to_csv(standardized_pathway_file, columns=['Edge1', 'Edge2', 'Rank', "Direction"], header=False, index=False, - sep='\t') + df.drop(columns=['InteractionType'], inplace = True) + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 5099b8d9..1b953600 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -155,4 +155,5 @@ def parse_output(raw_pathway_file, standardized_pathway_file): df = df.take([0, 1], axis=1) df = add_rank_column(df) df = reinsert_direction_col_undirected(df) - df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t') + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 85b38fe9..0147dc40 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -140,4 +140,5 @@ def parse_output(raw_pathway_file, standardized_pathway_file): # What about multiple raw_pathway_files df = pd.read_csv(raw_pathway_file, sep='\t').take([0, 1, 2], axis=1) df = reinsert_direction_col_directed(df) - df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t') + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt b/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt index 03571eae..44944b37 100644 --- a/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt +++ b/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction ABI1_HUMAN MK01_HUMAN 1 U CBLB_HUMAN EGFR_HUMAN 1 U CBL_HUMAN CD2AP_HUMAN 1 U diff --git a/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt b/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt index 30d107b0..b2033b57 100644 --- a/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt +++ b/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction ABI1_HUMAN MK01_HUMAN 1 U CBLB_HUMAN EGFR_HUMAN 1 U CBL_HUMAN CD2AP_HUMAN 1 U diff --git a/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt b/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt index 065ef6f9..e0adf2fc 100644 --- a/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt +++ b/test/analysis/input/egfr/tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction ABI1_HUMAN MK01_HUMAN 1 U CBL_HUMAN CD2AP_HUMAN 1 U CBL_HUMAN CRKL_HUMAN 1 U diff --git a/test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt b/test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt index 899147f8..bc9dfc85 100644 --- a/test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt +++ b/test/analysis/input/egfr/tps-egfr-pathlinker-params-7S4SLU6_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction EGF_HUMAN EGFR_HUMAN 1 U EGF_HUMAN S10A4_HUMAN 2 U S10A4_HUMAN MYH9_HUMAN 2 U diff --git a/test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt b/test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt index 3b1ddef5..a1738b00 100644 --- a/test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt +++ b/test/analysis/input/egfr/tps-egfr-pathlinker-params-TCEMRS7_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction EGF_HUMAN EGFR_HUMAN 1 U EGF_HUMAN S10A4_HUMAN 2 U S10A4_HUMAN MYH9_HUMAN 2 U diff --git a/test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt b/test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt index 9d65620f..5547a49c 100644 --- a/test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt +++ b/test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction A B 1 D B C 1 D diff --git a/test/analysis/input/example/data0-omicsintegrator1-params-RQCQ4YN_pathway.txt b/test/analysis/input/example/data0-omicsintegrator1-params-RQCQ4YN_pathway.txt index e2fd8d57..21768464 100644 --- a/test/analysis/input/example/data0-omicsintegrator1-params-RQCQ4YN_pathway.txt +++ b/test/analysis/input/example/data0-omicsintegrator1-params-RQCQ4YN_pathway.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction A B 1 U B C 1 U diff --git a/test/analysis/input/example/data0-omicsintegrator1-params-WY4V42C_pathway.txt b/test/analysis/input/example/data0-omicsintegrator1-params-WY4V42C_pathway.txt index e2fd8d57..21768464 100644 --- a/test/analysis/input/example/data0-omicsintegrator1-params-WY4V42C_pathway.txt +++ b/test/analysis/input/example/data0-omicsintegrator1-params-WY4V42C_pathway.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction A B 1 U B C 1 U diff --git a/test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt b/test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt index 65f6f221..e34eeaff 100644 --- a/test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt +++ b/test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction B A 1 U B C 1 U diff --git a/test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt b/test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt index 9d65620f..5547a49c 100644 --- a/test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt +++ b/test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction A B 1 D B C 1 D diff --git a/test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt b/test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt index 9d65620f..5547a49c 100644 --- a/test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt +++ b/test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction A B 1 D B C 1 D diff --git a/test/analysis/input/example/data1-meo-params-GKEDDFZ_pathway.txt b/test/analysis/input/example/data1-meo-params-GKEDDFZ_pathway.txt index 71ed6ccf..a87a0437 100644 --- a/test/analysis/input/example/data1-meo-params-GKEDDFZ_pathway.txt +++ b/test/analysis/input/example/data1-meo-params-GKEDDFZ_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 D B C 1 D A D 1 D diff --git a/test/analysis/input/example/data1-omicsintegrator1-params-JAZWLAK_pathway.txt b/test/analysis/input/example/data1-omicsintegrator1-params-JAZWLAK_pathway.txt index afbe030d..885a8574 100644 --- a/test/analysis/input/example/data1-omicsintegrator1-params-JAZWLAK_pathway.txt +++ b/test/analysis/input/example/data1-omicsintegrator1-params-JAZWLAK_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A D 1 U G H 1 U G I 1 U diff --git a/test/analysis/input/example/data1-omicsintegrator1-params-PU62FNV_pathway.txt b/test/analysis/input/example/data1-omicsintegrator1-params-PU62FNV_pathway.txt index afbe030d..885a8574 100644 --- a/test/analysis/input/example/data1-omicsintegrator1-params-PU62FNV_pathway.txt +++ b/test/analysis/input/example/data1-omicsintegrator1-params-PU62FNV_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A D 1 U G H 1 U G I 1 U diff --git a/test/analysis/input/example/data1-omicsintegrator2-params-IV3IPCJ_pathway.txt b/test/analysis/input/example/data1-omicsintegrator2-params-IV3IPCJ_pathway.txt index eddad79c..069481df 100644 --- a/test/analysis/input/example/data1-omicsintegrator2-params-IV3IPCJ_pathway.txt +++ b/test/analysis/input/example/data1-omicsintegrator2-params-IV3IPCJ_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction C D 1 U C F 1 U A D 1 U diff --git a/test/analysis/input/example/data1-pathlinker-params-6SWY7JS_pathway.txt b/test/analysis/input/example/data1-pathlinker-params-6SWY7JS_pathway.txt index 92b60b6e..ec070652 100644 --- a/test/analysis/input/example/data1-pathlinker-params-6SWY7JS_pathway.txt +++ b/test/analysis/input/example/data1-pathlinker-params-6SWY7JS_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 D B C 1 D A D 2 D diff --git a/test/analysis/input/example/data1-pathlinker-params-VQL7BDZ_pathway.txt b/test/analysis/input/example/data1-pathlinker-params-VQL7BDZ_pathway.txt index 92b60b6e..ec070652 100644 --- a/test/analysis/input/example/data1-pathlinker-params-VQL7BDZ_pathway.txt +++ b/test/analysis/input/example/data1-pathlinker-params-VQL7BDZ_pathway.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 D B C 1 D A D 2 D diff --git a/test/analysis/input/standardized-ranked.txt b/test/analysis/input/standardized-ranked.txt index c432c386..27f8222f 100644 --- a/test/analysis/input/standardized-ranked.txt +++ b/test/analysis/input/standardized-ranked.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 U A C 3 U C D 5 U diff --git a/test/analysis/input/toy/network1.txt b/test/analysis/input/toy/network1.txt index 21847821..bd5bd343 100644 --- a/test/analysis/input/toy/network1.txt +++ b/test/analysis/input/toy/network1.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 U C D 1 U E F 1 U diff --git a/test/analysis/input/toy/network2.txt b/test/analysis/input/toy/network2.txt index f7811bc4..7506195d 100644 --- a/test/analysis/input/toy/network2.txt +++ b/test/analysis/input/toy/network2.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 U B C 1 U C D 1 U diff --git a/test/analysis/input/toy/network3.txt b/test/analysis/input/toy/network3.txt index cbf42fb5..eaf05c07 100644 --- a/test/analysis/input/toy/network3.txt +++ b/test/analysis/input/toy/network3.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 U A C 1 U A D 1 U diff --git a/test/analysis/input/toy/network4.txt b/test/analysis/input/toy/network4.txt index d711ec1a..61ed9a4b 100644 --- a/test/analysis/input/toy/network4.txt +++ b/test/analysis/input/toy/network4.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 U B C 1 U D E 1 U diff --git a/test/analysis/input/toy/network5.txt b/test/analysis/input/toy/network5.txt index 5aaf5c0b..3d0eaf8c 100644 --- a/test/analysis/input/toy/network5.txt +++ b/test/analysis/input/toy/network5.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 U B C 1 U C D 1 U diff --git a/test/analysis/output/example_summary.txt b/test/analysis/output/example_summary.txt index f34497a4..fde30449 100644 --- a/test/analysis/output/example_summary.txt +++ b/test/analysis/output/example_summary.txt @@ -1,19 +1,13 @@ Name Number of nodes Number of undirected edges Number of connected components Nodes in sources Nodes in targets test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 1 1 -test/analysis/input/example/data0-omicsintegrator1-params-JAZWLAK_pathway.txt 0 0 0 0 0 -test/analysis/input/example/data0-omicsintegrator1-params-PU62FNV_pathway.txt 0 0 0 0 0 test/analysis/input/example/data0-omicsintegrator1-params-RQCQ4YN_pathway.txt 3 2 1 1 1 test/analysis/input/example/data0-omicsintegrator1-params-WY4V42C_pathway.txt 3 2 1 1 1 -test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0 0 test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 1 1 test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 1 1 test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 1 1 test/analysis/input/example/data1-meo-params-GKEDDFZ_pathway.txt 4 4 1 1 2 test/analysis/input/example/data1-omicsintegrator1-params-JAZWLAK_pathway.txt 5 3 2 1 3 test/analysis/input/example/data1-omicsintegrator1-params-PU62FNV_pathway.txt 5 3 2 1 3 -test/analysis/input/example/data1-omicsintegrator1-params-RQCQ4YN_pathway.txt 0 0 0 0 0 -test/analysis/input/example/data1-omicsintegrator1-params-WY4V42C_pathway.txt 0 0 0 0 0 -test/analysis/input/example/data1-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0 0 test/analysis/input/example/data1-omicsintegrator2-params-IV3IPCJ_pathway.txt 7 6 1 1 4 test/analysis/input/example/data1-pathlinker-params-6SWY7JS_pathway.txt 4 3 1 1 2 test/analysis/input/example/data1-pathlinker-params-VQL7BDZ_pathway.txt 4 3 1 1 2 diff --git a/test/ml/input/test-data-longName/longName.txt b/test/ml/input/test-data-longName/longName.txt index aabf41b2..7e120dff 100644 --- a/test/ml/input/test-data-longName/longName.txt +++ b/test/ml/input/test-data-longName/longName.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction node1 node2 1 U node1 node3 1 U node4 node5 1 U diff --git a/test/ml/input/test-data-longName2/longName2.txt b/test/ml/input/test-data-longName2/longName2.txt index 8765175f..35bf0c2e 100644 --- a/test/ml/input/test-data-longName2/longName2.txt +++ b/test/ml/input/test-data-longName2/longName2.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction node3 node2 1 U node1 node3 1 U node5 node4 1 U diff --git a/test/ml/input/test-data-s1/s1.txt b/test/ml/input/test-data-s1/s1.txt index 031f4142..a8a52914 100644 --- a/test/ml/input/test-data-s1/s1.txt +++ b/test/ml/input/test-data-s1/s1.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 U C D 1 U E F 1 U \ No newline at end of file diff --git a/test/ml/input/test-data-s2/s2.txt b/test/ml/input/test-data-s2/s2.txt index 680bf369..d4e9860b 100644 --- a/test/ml/input/test-data-s2/s2.txt +++ b/test/ml/input/test-data-s2/s2.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 U C D 1 U E F 1 U diff --git a/test/ml/input/test-data-s3/s3.txt b/test/ml/input/test-data-s3/s3.txt index d06960f9..6884cfe8 100644 --- a/test/ml/input/test-data-s3/s3.txt +++ b/test/ml/input/test-data-s3/s3.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction L M 1 U M N 1 U O P 1 U diff --git a/test/ml/input/test-data-spaces/spaces.txt b/test/ml/input/test-data-spaces/spaces.txt index 0860d779..3565af81 100644 --- a/test/ml/input/test-data-spaces/spaces.txt +++ b/test/ml/input/test-data-spaces/spaces.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction L M 1 U O P 1 U nodes with spaces in name 1 U \ No newline at end of file diff --git a/test/ml/input/test-mixed-direction/mixed-direction.txt b/test/ml/input/test-mixed-direction/mixed-direction.txt index 6463ab3b..f77061a1 100644 --- a/test/ml/input/test-mixed-direction/mixed-direction.txt +++ b/test/ml/input/test-mixed-direction/mixed-direction.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction A B 1 D B A 1 D C D 1 U diff --git a/test/parse-outputs/expected/allpairs-pathway-expected.txt b/test/parse-outputs/expected/allpairs-pathway-expected.txt index ee3c198b..3af52bc6 100644 --- a/test/parse-outputs/expected/allpairs-pathway-expected.txt +++ b/test/parse-outputs/expected/allpairs-pathway-expected.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction S1 A 1 U S1 B 1 U A E 1 U diff --git a/test/parse-outputs/expected/domino-pathway-expected.txt b/test/parse-outputs/expected/domino-pathway-expected.txt index 3fb1c13a..074f1b20 100644 --- a/test/parse-outputs/expected/domino-pathway-expected.txt +++ b/test/parse-outputs/expected/domino-pathway-expected.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction ENSG00000122691 ENSG00000138757 1 U ENSG00000122691 ENSG00000109320 1 U ENSG00000134954 ENSG00000077150 1 U diff --git a/test/parse-outputs/expected/meo-pathway-expected.txt b/test/parse-outputs/expected/meo-pathway-expected.txt index 1971d419..6515013f 100644 --- a/test/parse-outputs/expected/meo-pathway-expected.txt +++ b/test/parse-outputs/expected/meo-pathway-expected.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction GENEA GENEC 1 D GENEC GENEB 1 D diff --git a/test/parse-outputs/expected/mincostflow-pathway-expected.txt b/test/parse-outputs/expected/mincostflow-pathway-expected.txt index cd60214e..b25d172b 100644 --- a/test/parse-outputs/expected/mincostflow-pathway-expected.txt +++ b/test/parse-outputs/expected/mincostflow-pathway-expected.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction B A 1 U D B 1 U diff --git a/test/parse-outputs/expected/omicsintegrator1-pathway-expected.txt b/test/parse-outputs/expected/omicsintegrator1-pathway-expected.txt index 16f30549..f808bc3a 100644 --- a/test/parse-outputs/expected/omicsintegrator1-pathway-expected.txt +++ b/test/parse-outputs/expected/omicsintegrator1-pathway-expected.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction A C 1 D C D 1 U diff --git a/test/parse-outputs/expected/omicsintegrator2-pathway-expected.txt b/test/parse-outputs/expected/omicsintegrator2-pathway-expected.txt index 65f6f221..e34eeaff 100644 --- a/test/parse-outputs/expected/omicsintegrator2-pathway-expected.txt +++ b/test/parse-outputs/expected/omicsintegrator2-pathway-expected.txt @@ -1,2 +1,3 @@ +Node1 Node2 Rank Direction B A 1 U B C 1 U diff --git a/test/parse-outputs/expected/pathlinker-pathway-expected.txt b/test/parse-outputs/expected/pathlinker-pathway-expected.txt index 9edabc0c..e490cd91 100644 --- a/test/parse-outputs/expected/pathlinker-pathway-expected.txt +++ b/test/parse-outputs/expected/pathlinker-pathway-expected.txt @@ -1,3 +1,4 @@ +Node1 Node2 Rank Direction S2 T3 1 D A E 2 D S1 A 2 D From 1b340bb7b0f22b55d093571cc185c696e2da8e05 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 22 Dec 2023 12:25:25 -0800 Subject: [PATCH 02/26] precommit --- spras/allpairs.py | 2 +- spras/analysis/ml.py | 2 +- spras/analysis/summary.py | 2 +- spras/domino.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index 38f7f000..487cd0d8 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -110,7 +110,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param standardized_pathway_file: the same pathway written in the universal format """ df = pd.read_csv(raw_pathway_file, sep='\t', header=None) - + df['Rank'] = 1 # add a rank column of 1s since the edges are not ranked. df = reinsert_direction_col_undirected(df) df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 2c129873..98526ef1 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -57,7 +57,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra else: if direction == 'Direction': # if reading the header continue - else: + else: raise ValueError(f"direction is {direction}, rather than U or D") # getting the algorithm name diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index d0bc9582..4ea6595c 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -37,7 +37,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) -> # nw = nx.read_edgelist(file_path, data=(('weight', float), ('Direction',str))) if os.path.getsize(file_path) == 0: continue - else: + else: with open(file_path, 'r') as f: # skip the header line next(f) diff --git a/spras/domino.py b/spras/domino.py index 4666bf83..32136e97 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -207,7 +207,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file): edges_df = reinsert_direction_col_undirected(edges_df) edges_df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] edges_df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False) - else: + else: edges_df.to_csv(standardized_pathway_file, sep='\t', header=None, index=False) From 5edf8a79fceabcb542d2c5eb727ef6ed32ee2627 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Sat, 23 Dec 2023 14:57:54 -0800 Subject: [PATCH 03/26] updated summary.py code --- spras/analysis/summary.py | 16 ++++++---------- test/analysis/output/example_summary.txt | 6 ++++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index 4ea6595c..aca03e72 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -33,16 +33,12 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) -> # Iterate through each network file path for file_path in sorted(file_paths): - nw = None - # nw = nx.read_edgelist(file_path, data=(('weight', float), ('Direction',str))) - if os.path.getsize(file_path) == 0: - continue - else: - with open(file_path, 'r') as f: - # skip the header line - next(f) - # Load in the network - nw = nx.read_edgelist(f, data=(('weight', float), ('Direction', str))) + + lines = None + with open(file_path, 'r') as f: + lines = f.readlines()[1:] # skip the first line + + nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str))) # Save the network name, number of nodes, number edges, and number of connected components nw_name = str(file_path) diff --git a/test/analysis/output/example_summary.txt b/test/analysis/output/example_summary.txt index fde30449..f34497a4 100644 --- a/test/analysis/output/example_summary.txt +++ b/test/analysis/output/example_summary.txt @@ -1,13 +1,19 @@ Name Number of nodes Number of undirected edges Number of connected components Nodes in sources Nodes in targets test/analysis/input/example/data0-meo-params-GKEDDFZ_pathway.txt 3 2 1 1 1 +test/analysis/input/example/data0-omicsintegrator1-params-JAZWLAK_pathway.txt 0 0 0 0 0 +test/analysis/input/example/data0-omicsintegrator1-params-PU62FNV_pathway.txt 0 0 0 0 0 test/analysis/input/example/data0-omicsintegrator1-params-RQCQ4YN_pathway.txt 3 2 1 1 1 test/analysis/input/example/data0-omicsintegrator1-params-WY4V42C_pathway.txt 3 2 1 1 1 +test/analysis/input/example/data0-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0 0 test/analysis/input/example/data0-omicsintegrator2-params-IV3IPCJ_pathway.txt 3 2 1 1 1 test/analysis/input/example/data0-pathlinker-params-6SWY7JS_pathway.txt 3 2 1 1 1 test/analysis/input/example/data0-pathlinker-params-VQL7BDZ_pathway.txt 3 2 1 1 1 test/analysis/input/example/data1-meo-params-GKEDDFZ_pathway.txt 4 4 1 1 2 test/analysis/input/example/data1-omicsintegrator1-params-JAZWLAK_pathway.txt 5 3 2 1 3 test/analysis/input/example/data1-omicsintegrator1-params-PU62FNV_pathway.txt 5 3 2 1 3 +test/analysis/input/example/data1-omicsintegrator1-params-RQCQ4YN_pathway.txt 0 0 0 0 0 +test/analysis/input/example/data1-omicsintegrator1-params-WY4V42C_pathway.txt 0 0 0 0 0 +test/analysis/input/example/data1-omicsintegrator2-params-EHHWPMD_pathway.txt 0 0 0 0 0 test/analysis/input/example/data1-omicsintegrator2-params-IV3IPCJ_pathway.txt 7 6 1 1 4 test/analysis/input/example/data1-pathlinker-params-6SWY7JS_pathway.txt 4 3 1 1 2 test/analysis/input/example/data1-pathlinker-params-VQL7BDZ_pathway.txt 4 3 1 1 2 From 733b73682e636e7f06415b7e34a35a49a1449cee Mon Sep 17 00:00:00 2001 From: ntalluri Date: Sat, 23 Dec 2023 14:58:39 -0800 Subject: [PATCH 04/26] precommit --- spras/analysis/summary.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index aca03e72..0a173521 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -33,11 +33,11 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) -> # Iterate through each network file path for file_path in sorted(file_paths): - + lines = None with open(file_path, 'r') as f: lines = f.readlines()[1:] # skip the first line - + nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str))) # Save the network name, number of nodes, number edges, and number of connected components From 3964789f1be34fdd08e1764daa05a0628854c354 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 18 Jan 2024 16:52:58 -0800 Subject: [PATCH 05/26] added changes to cytoscape --- docker-wrappers/Cytoscape/cytoscape_util.py | 4 ++-- spras/analysis/cytoscape.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker-wrappers/Cytoscape/cytoscape_util.py b/docker-wrappers/Cytoscape/cytoscape_util.py index c8f47922..1a7b8d76 100644 --- a/docker-wrappers/Cytoscape/cytoscape_util.py +++ b/docker-wrappers/Cytoscape/cytoscape_util.py @@ -117,8 +117,8 @@ def load_pathways(pathways: List[str], output: str) -> None: file=path, column_type_list='s,t,x,ea', delimiters='\t', - first_row_as_column_names = True, - start_load_row = 2, + first_row_as_column_names=True, + ) p4c.networks.rename_network(name, network=suid) diff --git a/spras/analysis/cytoscape.py b/spras/analysis/cytoscape.py index f39fff6d..82c35fc6 100644 --- a/spras/analysis/cytoscape.py +++ b/spras/analysis/cytoscape.py @@ -51,7 +51,7 @@ def run_cytoscape(pathways: List[Union[str, PurePath]], output_file: str, singul # TODO consider making this a string in the config file instead of a Boolean container_framework = 'singularity' if singularity else 'docker' out = run_container(container_framework, - 'reedcompbio/py4cytoscape:v2', + 'reedcompbio/py4cytoscape:v3', command, volumes, work_dir, From 9d122e1c36a195962d1408b443ad1a26290a12e2 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Wed, 24 Jan 2024 14:10:07 -0600 Subject: [PATCH 06/26] precommit --- docker-wrappers/Cytoscape/cytoscape_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-wrappers/Cytoscape/cytoscape_util.py b/docker-wrappers/Cytoscape/cytoscape_util.py index 1a7b8d76..cc8ddd72 100644 --- a/docker-wrappers/Cytoscape/cytoscape_util.py +++ b/docker-wrappers/Cytoscape/cytoscape_util.py @@ -118,7 +118,7 @@ def load_pathways(pathways: List[str], output: str) -> None: column_type_list='s,t,x,ea', delimiters='\t', first_row_as_column_names=True, - + ) p4c.networks.rename_network(name, network=suid) From ada6cde76c8548b78461496e219c1bd209e06ad8 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 9 Feb 2024 12:56:06 -0600 Subject: [PATCH 07/26] update config --- config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.yaml b/config/config.yaml index b85c599b..4517e3ef 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -37,7 +37,7 @@ container_registry: algorithms: - name: "pathlinker" params: - include: false + include: true run1: k: range(100,201,100) From 0f1cba797bb8ab889ce574098eb86c6f774369ad Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 9 Feb 2024 14:13:25 -0600 Subject: [PATCH 08/26] review --- .github/workflows/test-spras.yml | 4 ++-- docker-wrappers/Cytoscape/README.md | 1 + spras/analysis/ml.py | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml index 3dc2ab85..ec252c8f 100644 --- a/.github/workflows/test-spras.yml +++ b/.github/workflows/test-spras.yml @@ -152,8 +152,8 @@ jobs: path: docker-wrappers/Cytoscape/. dockerfile: docker-wrappers/Cytoscape/Dockerfile repository: reedcompbio/py4cytoscape - tags: v2 - cache_froms: reedcompbio/py4cytoscape:v2 + tags: v3 + cache_froms: reedcompbio/py4cytoscape:v3 push: false # Run pre-commit checks on source files diff --git a/docker-wrappers/Cytoscape/README.md b/docker-wrappers/Cytoscape/README.md index 2d747ffd..c3cb8967 100644 --- a/docker-wrappers/Cytoscape/README.md +++ b/docker-wrappers/Cytoscape/README.md @@ -20,6 +20,7 @@ The Docker wrapper can be tested with `pytest`. ## Versions: - v1: Use supervisord to launch Cytoscape from a Python subprocess, then connect to Cytoscape with py4cytoscape. Only loads undirected pathways. Compatible with Singularity in local testing (Apptainer version 1.2.2-1.el7) but fails in GitHub Actions. - v2: Add support for edge direction column. +- v3: Add support for header lines in files ## TODO - Add an auth file for `xvfb-run` diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 98526ef1..aa2bdb9b 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -44,7 +44,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra edges = [] for line in lines: parts = line.split('\t') - if len(parts) >= 4: # in case of empty line in file or line doesn't include all values + if len(parts) == 4: node1 = parts[0] node2 = parts[1] direction = str(parts[3]).strip() @@ -59,7 +59,8 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra continue else: raise ValueError(f"direction is {direction}, rather than U or D") - + elif len(parts) > 4 and len(parts) > 0: # empty line in file is okay + raise ValueError(f"A line in pathway file {file} contains {len(parts)} values. There should be 4 values per line") # getting the algorithm name p = PurePath(file) edge_tuples.append((p.parts[-2], edges)) From a9de3edc0c91abc28c903d04eeadc934db34b6be Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 9 Feb 2024 14:29:50 -0600 Subject: [PATCH 09/26] update ml --- spras/analysis/ml.py | 2 +- test/ml/input/test-data-empty-line/emptyLine.txt | 4 ++++ .../less.txt | 5 +++++ .../more.txt | 5 +++++ test/ml/test_ml.py | 15 ++++++++++++++- 5 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 test/ml/input/test-data-empty-line/emptyLine.txt create mode 100644 test/ml/input/test-data-unexpected-amount-of-values/less.txt create mode 100644 test/ml/input/test-data-unexpected-amount-of-values/more.txt diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index aa2bdb9b..d4316750 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -59,7 +59,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra continue else: raise ValueError(f"direction is {direction}, rather than U or D") - elif len(parts) > 4 and len(parts) > 0: # empty line in file is okay + elif len(parts) > 4 or (len(parts) < 4 and len(parts) > 0): # empty line in file is okay raise ValueError(f"A line in pathway file {file} contains {len(parts)} values. There should be 4 values per line") # getting the algorithm name p = PurePath(file) diff --git a/test/ml/input/test-data-empty-line/emptyLine.txt b/test/ml/input/test-data-empty-line/emptyLine.txt new file mode 100644 index 00000000..144f2487 --- /dev/null +++ b/test/ml/input/test-data-empty-line/emptyLine.txt @@ -0,0 +1,4 @@ +Node1 Node2 Rank Direction +A B 1 U + +E F 1 U \ No newline at end of file diff --git a/test/ml/input/test-data-unexpected-amount-of-values/less.txt b/test/ml/input/test-data-unexpected-amount-of-values/less.txt new file mode 100644 index 00000000..b6d268a1 --- /dev/null +++ b/test/ml/input/test-data-unexpected-amount-of-values/less.txt @@ -0,0 +1,5 @@ +Node1 Node2 Rank Direction +A B +C D 1 +E +L M 1 U \ No newline at end of file diff --git a/test/ml/input/test-data-unexpected-amount-of-values/more.txt b/test/ml/input/test-data-unexpected-amount-of-values/more.txt new file mode 100644 index 00000000..f43c2d7d --- /dev/null +++ b/test/ml/input/test-data-unexpected-amount-of-values/more.txt @@ -0,0 +1,5 @@ +Node1 Node2 Rank Direction +A B 1 U B B +C D 1 U B B B B +E F 1 U B +L M 1 U \ No newline at end of file diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index fa868a59..5641c282 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -1,6 +1,6 @@ import filecmp from pathlib import Path - +import pytest import pandas as pd import spras.analysis.ml as ml @@ -25,6 +25,19 @@ def test_summarize_networks(self): dataframe.to_csv(OUT_DIR + 'dataframe.csv') assert filecmp.cmp(OUT_DIR + 'dataframe.csv', EXPECT_DIR + 'expected-dataframe.csv', shallow=False) + + def test_summarize_networks_less_values(self): + with pytest.raises(ValueError): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-unexpected-amount-of-values/less.txt']) + + def test_summarize_networks_more_values(self): + with pytest.raises(ValueError): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-unexpected-amount-of-values/more.txt']) + + def test_summarize_networks_empty_line(self): + with pytest.raises(ValueError): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty-line/emptyLine.txt']) + def test_pca(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', From 6a9ea4a6cb3754efb9f72d32c0066f0073172523 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 9 Feb 2024 14:57:16 -0600 Subject: [PATCH 10/26] update ml and test cases --- spras/analysis/ml.py | 7 ++---- test/ml/expected/expected-dataframe.csv | 2 +- .../mixed-direction.txt | 0 .../wrong-direction.txt | 5 +++++ test/ml/test_ml.py | 22 +++++++++++-------- 5 files changed, 21 insertions(+), 15 deletions(-) rename test/ml/input/{test-mixed-direction => test-data-mixed-direction}/mixed-direction.txt (100%) create mode 100644 test/ml/input/test-data-wrong-direction/wrong-direction.txt diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index d4316750..f9ec5eb6 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -54,11 +54,8 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra elif direction == "D": # node order does matter for directed edges edges.append(DIR_CONST.join([node1, node2])) - else: - if direction == 'Direction': # if reading the header - continue - else: - raise ValueError(f"direction is {direction}, rather than U or D") + elif direction != 'Direction' and direction != 'U' and direction != 'D': + raise ValueError(f"direction is {direction}, rather than U or D") elif len(parts) > 4 or (len(parts) < 4 and len(parts) > 0): # empty line in file is okay raise ValueError(f"A line in pathway file {file} contains {len(parts)} values. There should be 4 values per line") # getting the algorithm name diff --git a/test/ml/expected/expected-dataframe.csv b/test/ml/expected/expected-dataframe.csv index b594efb4..15cff881 100644 --- a/test/ml/expected/expected-dataframe.csv +++ b/test/ml/expected/expected-dataframe.csv @@ -1,4 +1,4 @@ -,test-data-s1,test-data-s2,test-data-s3,test-data-longName,test-data-longName2,test-data-empty,test-data-spaces,test-mixed-direction +,test-data-s1,test-data-s2,test-data-s3,test-data-longName,test-data-longName2,test-data-empty,test-data-spaces,test-data-mixed-direction A---B,1,1,0,0,0,0,0,0 C---D,1,1,0,0,0,0,0,1 E---F,1,1,0,0,0,0,0,1 diff --git a/test/ml/input/test-mixed-direction/mixed-direction.txt b/test/ml/input/test-data-mixed-direction/mixed-direction.txt similarity index 100% rename from test/ml/input/test-mixed-direction/mixed-direction.txt rename to test/ml/input/test-data-mixed-direction/mixed-direction.txt diff --git a/test/ml/input/test-data-wrong-direction/wrong-direction.txt b/test/ml/input/test-data-wrong-direction/wrong-direction.txt new file mode 100644 index 00000000..dc45581c --- /dev/null +++ b/test/ml/input/test-data-wrong-direction/wrong-direction.txt @@ -0,0 +1,5 @@ +Node1 Node2 Rank Direction +A B 1 D +B A 1 D +C D 1 B +E F 1 U \ No newline at end of file diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 5641c282..99f8e065 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -1,7 +1,8 @@ import filecmp from pathlib import Path -import pytest + import pandas as pd +import pytest import spras.analysis.ml as ml @@ -21,22 +22,25 @@ def setup_class(cls): def test_summarize_networks(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt', INPUT_DIR + 'test-data-longName/longName.txt', INPUT_DIR + 'test-data-longName2/longName2.txt', - INPUT_DIR + 'test-data-empty/empty.txt', INPUT_DIR + 'test-data-spaces/spaces.txt', INPUT_DIR + 'test-mixed-direction/mixed-direction.txt']) + INPUT_DIR + 'test-data-empty/empty.txt', INPUT_DIR + 'test-data-spaces/spaces.txt', INPUT_DIR + 'test-data-mixed-direction/mixed-direction.txt']) dataframe.to_csv(OUT_DIR + 'dataframe.csv') assert filecmp.cmp(OUT_DIR + 'dataframe.csv', EXPECT_DIR + 'expected-dataframe.csv', shallow=False) - def test_summarize_networks_less_values(self): with pytest.raises(ValueError): - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-unexpected-amount-of-values/less.txt']) - + ml.summarize_networks([INPUT_DIR + 'test-data-unexpected-amount-of-values/less.txt']) + def test_summarize_networks_more_values(self): with pytest.raises(ValueError): - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-unexpected-amount-of-values/more.txt']) - + ml.summarize_networks([INPUT_DIR + 'test-data-unexpected-amount-of-values/more.txt']) + def test_summarize_networks_empty_line(self): with pytest.raises(ValueError): - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-empty-line/emptyLine.txt']) + ml.summarize_networks([INPUT_DIR + 'test-data-empty-line/emptyLine.txt']) + + def test_summarize_networks_wrong_direction(self): + with pytest.raises(ValueError): + ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt']) def test_pca(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) @@ -62,7 +66,7 @@ def test_hac_vertical(self): assert filecmp.cmp(OUT_DIR + 'hac-clusters-vertical.txt', EXPECT_DIR + 'expected-hac-vertical-clusters.txt', shallow=False) def test_ensemble_network(self): - dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt', INPUT_DIR + 'test-mixed-direction/mixed-direction.txt']) + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt', INPUT_DIR + 'test-data-mixed-direction/mixed-direction.txt']) ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network.tsv') en = pd.read_table(OUT_DIR + 'ensemble-network.tsv') From f9e989e2c7889b3ecba515403eb5cefd649a46ef Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 9 Feb 2024 15:24:59 -0600 Subject: [PATCH 11/26] update contributing guide --- CONTRIBUTING.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a264c5f5..2ec87b01 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -156,7 +156,8 @@ Implement the `parse_output` function. The edges in the Local Neighborhood output have the same format as the input, `|`. Convert these to be tab-separated vertex pairs followed by a tab and a `1` at the end of every line, which indicates all edges have the same rank. See the `add_rank_column` function in `src.util.py`. -The output should have the format ` 1`. +Make sure header = True when the file is created. +The output should have the format ` 1 U`. ### Step 4: Make the Local Neighborhood wrapper accessible through SPRAS Import the new class `LocalNeighborhood` in `src/runner.py` so the wrapper functions can be accessed. From 278b761b7a085fa33281b77f0998c10e97be36c1 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 12 Mar 2024 16:04:11 -0500 Subject: [PATCH 12/26] ml changes --- spras/analysis/ml.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index f9ec5eb6..206b5a97 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -41,10 +41,13 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra with open(file, 'r') as f: lines = f.readlines() + if len(lines) > 0: + lines.pop(0) # process header line + edges = [] for line in lines: parts = line.split('\t') - if len(parts) == 4: + if len(parts) == 4: # empty lines not allowed but empty files are allowed node1 = parts[0] node2 = parts[1] direction = str(parts[3]).strip() @@ -56,8 +59,9 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra edges.append(DIR_CONST.join([node1, node2])) elif direction != 'Direction' and direction != 'U' and direction != 'D': raise ValueError(f"direction is {direction}, rather than U or D") - elif len(parts) > 4 or (len(parts) < 4 and len(parts) > 0): # empty line in file is okay - raise ValueError(f"A line in pathway file {file} contains {len(parts)} values. There should be 4 values per line") + elif len(parts) != 0: + raise ValueError(f"In file {file}, expected line {line} to have 4 values, but found {len(parts)} values.") + # getting the algorithm name p = PurePath(file) edge_tuples.append((p.parts[-2], edges)) From 4dfd018700bae11ecf2b729879f4c7533afaaf27 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 12 Mar 2024 16:24:51 -0500 Subject: [PATCH 13/26] precommit to ml --- spras/analysis/ml.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 206b5a97..23b17b87 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -42,8 +42,8 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra lines = f.readlines() if len(lines) > 0: - lines.pop(0) # process header line - + lines.pop(0) # process header line + edges = [] for line in lines: parts = line.split('\t') @@ -61,7 +61,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra raise ValueError(f"direction is {direction}, rather than U or D") elif len(parts) != 0: raise ValueError(f"In file {file}, expected line {line} to have 4 values, but found {len(parts)} values.") - + # getting the algorithm name p = PurePath(file) edge_tuples.append((p.parts[-2], edges)) From c6da91a98622178925e12c17c752e76513733fae Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 12 Mar 2024 17:25:19 -0500 Subject: [PATCH 14/26] attempting error checking for empty df read from rpw --- spras/allpairs.py | 12 ++++---- spras/domino.py | 3 +- spras/meo.py | 20 +++++++------ spras/mincostflow.py | 14 ++++----- spras/omicsintegrator1.py | 60 ++++++++++++++++++++++++++++++--------- spras/omicsintegrator2.py | 20 +++++++------ spras/pathlinker.py | 14 +++++++-- spras/util.py | 11 +++++++ 8 files changed, 107 insertions(+), 47 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index 0a276e92..e54cf9f7 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -9,6 +9,7 @@ reinsert_direction_col_undirected, ) from spras.prm import PRM +from spras.util import add_rank_column, raw_pathway_df __all__ = ['AllPairs'] @@ -111,9 +112,10 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param raw_pathway_file: pathway file produced by an algorithm's run function @param standardized_pathway_file: the same pathway written in the universal format """ - df = pd.read_csv(raw_pathway_file, sep='\t', header=None) - - df['Rank'] = 1 # add a rank column of 1s since the edges are not ranked. - df = reinsert_direction_col_undirected(df) - df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] + # df = pd.read_csv(raw_pathway_file, sep='\t', header=None) + df = raw_pathway_df(raw_pathway_file, header = None) + if not df.empty: + df = add_rank_column(df) + df = reinsert_direction_col_undirected(df) + df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/domino.py b/spras/domino.py index 594041b7..9e6b256e 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -208,7 +208,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file): edges_df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] edges_df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False) else: - edges_df.to_csv(standardized_pathway_file, sep='\t', header=None, index=False) + df = pd.DataFrame(columns = ['Node1', 'Node2', 'Rank', 'Direction']) + df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False) def pre_domino_id_transform(node_id): diff --git a/spras/meo.py b/spras/meo.py index d111d6a3..7a70119b 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -8,7 +8,7 @@ reinsert_direction_col_directed, ) from spras.prm import PRM -from spras.util import add_rank_column +from spras.util import add_rank_column, raw_pathway_df __all__ = ['MEO', 'write_properties'] @@ -181,13 +181,15 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param standardized_pathway_file: the same pathway written in the universal format """ # Columns Source Type Target Oriented Weight - df = pd.read_csv(raw_pathway_file, sep='\t') + # df = pd.read_csv(raw_pathway_file, sep='\t', header=0) + df = raw_pathway_df(raw_pathway_file, header=0) + if not df.empty: # Keep only edges that were assigned an orientation (direction) - df = df.loc[df['Oriented']] - # TODO what should be the edge rank? - # Would need to load the paths output file to rank edges correctly - df = add_rank_column(df) - df = reinsert_direction_col_directed(df) - df.drop(columns=['Type', 'Oriented', 'Weight'], inplace = True) - df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + df = df.loc[df['Oriented']] + # TODO what should be the edge rank? + # Would need to load the paths output file to rank edges correctly + df = add_rank_column(df) + df = reinsert_direction_col_directed(df) + df.drop(columns=['Type', 'Oriented', 'Weight'], inplace = True) + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 44575038..f04b8034 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -8,7 +8,7 @@ reinsert_direction_col_undirected, ) from spras.prm import PRM -from spras.util import add_rank_column +from spras.util import add_rank_column, raw_pathway_df __all__ = ['MinCostFlow'] @@ -150,11 +150,11 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param standardized_pathway_file: the same pathway written in the universal format """ - df = pd.read_csv(raw_pathway_file, sep='\t', header=None) - df = add_rank_column(df) - # TODO update MinCostFlow version to support mixed graphs - # Currently directed edges in the input will be converted to undirected edges in the output - df = reinsert_direction_col_undirected(df) - df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + df = raw_pathway_df(raw_pathway_file, header = None) + # df = pd.read_csv(raw_pathway_file, sep='\t', header=None) + if not df.empty: + df = add_rank_column(df) + df = reinsert_direction_col_undirected(df) + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 2abda226..3e25cde2 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,3 +1,4 @@ +import os from pathlib import Path import pandas as pd @@ -5,7 +6,7 @@ from spras.containers import prepare_volume, run_container from spras.interactome import reinsert_direction_col_mixed from spras.prm import PRM -from spras.util import add_rank_column +from spras.util import add_rank_column, raw_pathway_df __all__ = ['OmicsIntegrator1', 'write_conf'] @@ -191,16 +192,49 @@ def parse_output(raw_pathway_file, standardized_pathway_file): # I'm assuming from having read the documentation that we will be passing in optimalForest.sif # as raw_pathway_file, in which case the format should be edge1 interactiontype edge2. # if that assumption is wrong we will need to tweak things - try: - df = pd.read_csv(raw_pathway_file, sep='\t', header=None) - except pd.errors.EmptyDataError: - with open(standardized_pathway_file, 'w'): - pass - return - - df.columns = ["Edge1", "InteractionType", "Edge2"] - df = add_rank_column(df) - df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp") - df.drop(columns=['InteractionType'], inplace = True) - df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + # try: + # # check_file = os.stat(raw_pathway_file).st_size + + # # if(check_file == 0): + # # print("The file is empty.") + # # df = pd.DataFrame(columns = ['Edge1', 'Edge2','Rank','Direction']) + + # # else: + # df = pd.read_csv(raw_pathway_file, sep='\t', header=None) + # df.columns = ["Edge1", "InteractionType", "Edge2"] + # df = add_rank_column(df) + # df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp") + # df.drop(columns=['InteractionType'], inplace = True) + # df.columns = ['Node1', 'Node2','Rank','Direction'] + + # except pd.errors.EmptyDataError: + # print("we hit empty thingy") + # df = pd.DataFrame(columns = ['Node1', 'Node2','Rank','Direction']) + # # with open(standardized_pathway_file, 'w'): + # # pass + # # return + + df = raw_pathway_df(raw_pathway_file, header=None) + if not df.empty: + df.columns = ["Edge1", "InteractionType", "Edge2"] + df = add_rank_column(df) + df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp") + df.drop(columns=['InteractionType'], inplace = True) + df.columns = ['Node1', 'Node2','Rank','Direction'] + + # if df.empty: + # print("THE DATA FRAME IS EMPTY") + # print(df) + # else: + # print("THE DF IS NOT EMPTY") + + # df.columns = ["Edge1", "InteractionType", "Edge2"] + # df = add_rank_column(df) + # df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp") + # df.drop(columns=['InteractionType'], inplace = True) + + + + print(df) + print(df.columns) df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 4083d0ab..6734aa06 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -6,7 +6,7 @@ from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_undirected from spras.prm import PRM -from spras.util import add_rank_column +from spras.util import add_rank_column, raw_pathway_df __all__ = ['OmicsIntegrator2'] @@ -149,13 +149,15 @@ def parse_output(raw_pathway_file, standardized_pathway_file): # Omicsintegrator2 returns a single line file if no network is found num_lines = sum(1 for line in open(raw_pathway_file)) if num_lines < 2: - with open(standardized_pathway_file, 'w'): - pass + df = pd.DataFrame(columns = ['Node1', 'Node2', 'Rank', 'Direction']) + df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') return - df = pd.read_csv(raw_pathway_file, sep='\t') - df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line - df = df.take([0, 1], axis=1) - df = add_rank_column(df) - df = reinsert_direction_col_undirected(df) - df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + df = raw_pathway_df(raw_pathway_file, header=0 ) + # df = pd.read_csv(raw_pathway_file, sep='\t', header = 0) + # df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line + if not df.empty: + df = df.take([0, 1], axis=1) + df = add_rank_column(df) + df = reinsert_direction_col_undirected(df) + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/pathlinker.py b/spras/pathlinker.py index c0bd7ed8..4ffc3246 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -9,6 +9,7 @@ reinsert_direction_col_directed, ) from spras.prm import PRM +from spras.util import raw_pathway_df __all__ = ['PathLinker'] @@ -137,7 +138,14 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param standardized_pathway_file: the same pathway written in the universal format """ # What about multiple raw_pathway_files - df = pd.read_csv(raw_pathway_file, sep='\t').take([0, 1, 2], axis=1) - df = reinsert_direction_col_directed(df) - df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + # try: + # df = pd.read_csv(raw_pathway_file, sep='\t') + + # except pd.errors.EmptyDataError: + # print("we hit empty thingy") + df = raw_pathway_df(raw_pathway_file, header = 0) + if not df.empty: + df = df.take([0, 1, 2], axis=1) + df = reinsert_direction_col_directed(df) + df.columns = ['Node1', 'Node2', 'Rank', "Direction"] df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/util.py b/spras/util.py index ea6cd952..9b51ac5a 100644 --- a/spras/util.py +++ b/spras/util.py @@ -59,3 +59,14 @@ def add_rank_column(df: pd.DataFrame) -> pd.DataFrame: """ df['Rank'] = 1 return df + +def raw_pathway_df(raw_pathway_file: str, header:int= None) -> pd.DataFrame: + """ + creates df from contents in raw pathway file, otherwise returns an empty df + """ + try: + df = pd.read_csv(raw_pathway_file, sep='\t', header=header) + except pd.errors.EmptyDataError: # read an empty file + df = pd.DataFrame(columns = ['Node1', 'Node2','Rank','Direction']) + + return df From c183abf903f3fce9bc1ee3b7043f1054fac5fbcb Mon Sep 17 00:00:00 2001 From: ntalluri Date: Wed, 13 Mar 2024 15:58:59 -0500 Subject: [PATCH 15/26] cleaned up code --- spras/allpairs.py | 3 +-- spras/meo.py | 2 -- spras/mincostflow.py | 3 +-- spras/omicsintegrator1.py | 37 +------------------------------------ spras/omicsintegrator2.py | 15 +++++++-------- spras/pathlinker.py | 8 +------- spras/util.py | 3 +++ 7 files changed, 14 insertions(+), 57 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index e54cf9f7..cdce7e40 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -112,8 +112,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param raw_pathway_file: pathway file produced by an algorithm's run function @param standardized_pathway_file: the same pathway written in the universal format """ - # df = pd.read_csv(raw_pathway_file, sep='\t', header=None) - df = raw_pathway_df(raw_pathway_file, header = None) + df = raw_pathway_df(raw_pathway_file, header=None) if not df.empty: df = add_rank_column(df) df = reinsert_direction_col_undirected(df) diff --git a/spras/meo.py b/spras/meo.py index 7a70119b..16fae07e 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -180,8 +180,6 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param raw_pathway_file: pathway file produced by an algorithm's run function @param standardized_pathway_file: the same pathway written in the universal format """ - # Columns Source Type Target Oriented Weight - # df = pd.read_csv(raw_pathway_file, sep='\t', header=0) df = raw_pathway_df(raw_pathway_file, header=0) if not df.empty: # Keep only edges that were assigned an orientation (direction) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index f04b8034..531ae532 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -150,8 +150,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param standardized_pathway_file: the same pathway written in the universal format """ - df = raw_pathway_df(raw_pathway_file, header = None) - # df = pd.read_csv(raw_pathway_file, sep='\t', header=None) + df = raw_pathway_df(raw_pathway_file, header=None) if not df.empty: df = add_rank_column(df) df = reinsert_direction_col_undirected(df) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 3e25cde2..da4c5459 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -192,27 +192,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file): # I'm assuming from having read the documentation that we will be passing in optimalForest.sif # as raw_pathway_file, in which case the format should be edge1 interactiontype edge2. # if that assumption is wrong we will need to tweak things - # try: - # # check_file = os.stat(raw_pathway_file).st_size - - # # if(check_file == 0): - # # print("The file is empty.") - # # df = pd.DataFrame(columns = ['Edge1', 'Edge2','Rank','Direction']) - - # # else: - # df = pd.read_csv(raw_pathway_file, sep='\t', header=None) - # df.columns = ["Edge1", "InteractionType", "Edge2"] - # df = add_rank_column(df) - # df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp") - # df.drop(columns=['InteractionType'], inplace = True) - # df.columns = ['Node1', 'Node2','Rank','Direction'] - - # except pd.errors.EmptyDataError: - # print("we hit empty thingy") - # df = pd.DataFrame(columns = ['Node1', 'Node2','Rank','Direction']) - # # with open(standardized_pathway_file, 'w'): - # # pass - # # return + df = raw_pathway_df(raw_pathway_file, header=None) if not df.empty: @@ -222,19 +202,4 @@ def parse_output(raw_pathway_file, standardized_pathway_file): df.drop(columns=['InteractionType'], inplace = True) df.columns = ['Node1', 'Node2','Rank','Direction'] - # if df.empty: - # print("THE DATA FRAME IS EMPTY") - # print(df) - # else: - # print("THE DF IS NOT EMPTY") - - # df.columns = ["Edge1", "InteractionType", "Edge2"] - # df = add_rank_column(df) - # df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp") - # df.drop(columns=['InteractionType'], inplace = True) - - - - print(df) - print(df.columns) df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 6734aa06..e1197490 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -150,14 +150,13 @@ def parse_output(raw_pathway_file, standardized_pathway_file): num_lines = sum(1 for line in open(raw_pathway_file)) if num_lines < 2: df = pd.DataFrame(columns = ['Node1', 'Node2', 'Rank', 'Direction']) - df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') - return - df = raw_pathway_df(raw_pathway_file, header=0 ) - # df = pd.read_csv(raw_pathway_file, sep='\t', header = 0) - # df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line - if not df.empty: - df = df.take([0, 1], axis=1) - df = add_rank_column(df) + else: + df = pd.read_csv(raw_pathway_file, sep='\t', header=0) + df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line + df = add_rank_column(df) df = reinsert_direction_col_undirected(df) + df.drop(columns=['cost', 'in_solution'], inplace = True) df.columns = ['Node1', 'Node2', 'Rank', "Direction"] + df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') + diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 4ffc3246..97ee64ad 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -137,13 +137,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param raw_pathway_file: pathway file produced by an algorithm's run function @param standardized_pathway_file: the same pathway written in the universal format """ - # What about multiple raw_pathway_files - # try: - # df = pd.read_csv(raw_pathway_file, sep='\t') - - # except pd.errors.EmptyDataError: - # print("we hit empty thingy") - df = raw_pathway_df(raw_pathway_file, header = 0) + df = raw_pathway_df(raw_pathway_file, header=0) if not df.empty: df = df.take([0, 1, 2], axis=1) df = reinsert_direction_col_directed(df) diff --git a/spras/util.py b/spras/util.py index 9b51ac5a..94c7ab2f 100644 --- a/spras/util.py +++ b/spras/util.py @@ -63,6 +63,9 @@ def add_rank_column(df: pd.DataFrame) -> pd.DataFrame: def raw_pathway_df(raw_pathway_file: str, header:int= None) -> pd.DataFrame: """ creates df from contents in raw pathway file, otherwise returns an empty df + @param raw_pathway_file: the specific path to the raw_pathway_file to read from + @param header: what row the header is, otherwise None + """ try: df = pd.read_csv(raw_pathway_file, sep='\t', header=header) From 01ad3422eaf88b508cecc526515a29a83c9f05bc Mon Sep 17 00:00:00 2001 From: ntalluri Date: Wed, 13 Mar 2024 16:00:44 -0500 Subject: [PATCH 16/26] precommit --- spras/omicsintegrator1.py | 2 +- spras/omicsintegrator2.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index da4c5459..4c5e5164 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -192,7 +192,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file): # I'm assuming from having read the documentation that we will be passing in optimalForest.sif # as raw_pathway_file, in which case the format should be edge1 interactiontype edge2. # if that assumption is wrong we will need to tweak things - + df = raw_pathway_df(raw_pathway_file, header=None) if not df.empty: diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index e1197490..b1419258 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -150,13 +150,13 @@ def parse_output(raw_pathway_file, standardized_pathway_file): num_lines = sum(1 for line in open(raw_pathway_file)) if num_lines < 2: df = pd.DataFrame(columns = ['Node1', 'Node2', 'Rank', 'Direction']) - else: + else: df = pd.read_csv(raw_pathway_file, sep='\t', header=0) df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line - df = add_rank_column(df) + df = add_rank_column(df) df = reinsert_direction_col_undirected(df) df.drop(columns=['cost', 'in_solution'], inplace = True) df.columns = ['Node1', 'Node2', 'Rank', "Direction"] - + df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') - + From 129532c08c36e88e87f9d8bc61f40fe50b053b91 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 18 Mar 2024 13:49:04 -0500 Subject: [PATCH 17/26] clean up new util func, add new test, add to contributing guide --- CONTRIBUTING.md | 4 ++-- spras/omicsintegrator2.py | 4 ++-- spras/util.py | 7 ++++--- test/parse-outputs/expected/empty-pathway-expected.txt | 1 + test/parse-outputs/input/empty-raw-pathway.txt | 0 test/parse-outputs/test_parse_outputs.py | 10 ++++++++++ 6 files changed, 19 insertions(+), 7 deletions(-) create mode 100644 test/parse-outputs/expected/empty-pathway-expected.txt create mode 100644 test/parse-outputs/input/empty-raw-pathway.txt diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2ec87b01..c4452f7a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -155,8 +155,8 @@ Use the `run_container` utility function to run the command in the container `|`. Convert these to be tab-separated vertex pairs followed by a tab and a `1` at the end of every line, which indicates all edges have the same rank. -See the `add_rank_column` function in `src.util.py`. -Make sure header = True when the file is created. +See the `add_rank_column` and `raw_pathway_df` function in `src.util.py`. +Make sure header = True with column names: ['Node1', 'Node2', 'Rank', 'Direction'] when the file is created The output should have the format ` 1 U`. ### Step 4: Make the Local Neighborhood wrapper accessible through SPRAS diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index b1419258..2d8dee3c 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -152,10 +152,10 @@ def parse_output(raw_pathway_file, standardized_pathway_file): df = pd.DataFrame(columns = ['Node1', 'Node2', 'Rank', 'Direction']) else: df = pd.read_csv(raw_pathway_file, sep='\t', header=0) - df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line + df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line + df = df.take([0, 1], axis=1) df = add_rank_column(df) df = reinsert_direction_col_undirected(df) - df.drop(columns=['cost', 'in_solution'], inplace = True) df.columns = ['Node1', 'Node2', 'Rank', "Direction"] df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/util.py b/spras/util.py index 94c7ab2f..efa9bc73 100644 --- a/spras/util.py +++ b/spras/util.py @@ -62,9 +62,10 @@ def add_rank_column(df: pd.DataFrame) -> pd.DataFrame: def raw_pathway_df(raw_pathway_file: str, header:int= None) -> pd.DataFrame: """ - creates df from contents in raw pathway file, otherwise returns an empty df - @param raw_pathway_file: the specific path to the raw_pathway_file to read from - @param header: what row the header is, otherwise None + Creates DF from contents in raw pathway file, + otherwise returns an empty DF with standard output column names + @param raw_pathway_file: path to raw_pathway_file + @param header: what row the header is in raw_pathway_file, otherwise None """ try: diff --git a/test/parse-outputs/expected/empty-pathway-expected.txt b/test/parse-outputs/expected/empty-pathway-expected.txt new file mode 100644 index 00000000..a1a76651 --- /dev/null +++ b/test/parse-outputs/expected/empty-pathway-expected.txt @@ -0,0 +1 @@ +Node1 Node2 Rank Direction diff --git a/test/parse-outputs/input/empty-raw-pathway.txt b/test/parse-outputs/input/empty-raw-pathway.txt new file mode 100644 index 00000000..e69de29b diff --git a/test/parse-outputs/test_parse_outputs.py b/test/parse-outputs/test_parse_outputs.py index 8d8d0933..0f471e5d 100644 --- a/test/parse-outputs/test_parse_outputs.py +++ b/test/parse-outputs/test_parse_outputs.py @@ -1,6 +1,9 @@ import filecmp from pathlib import Path +import pandas as pd +import pytest + from spras import runner INDIR = "test/parse-outputs/input/" @@ -29,3 +32,10 @@ def test_parse_outputs(self): runner.parse_output(algo, test_file, out_file) assert filecmp.cmp(OUTDIR + f"{algo}-pathway.txt", EXPDIR + f"{algo}-pathway-expected.txt", shallow=False) + + def test_empty_file(self): + for algo in algorithms: + test_file = INDIR + f"empty-raw-pathway.txt" + out_file = OUTDIR + f"{algo}-empty-pathway.txt" + runner.parse_output(algo, test_file, out_file) + assert filecmp.cmp(OUTDIR + f"{algo}-empty-pathway.txt", EXPDIR + f"empty-pathway-expected.txt", shallow=False) From bbd907584c535056daa7e6e6d6dbd1a89e424922 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 18 Mar 2024 14:06:58 -0500 Subject: [PATCH 18/26] trying to fix error --- spras/meo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/meo.py b/spras/meo.py index 16fae07e..c854a64f 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -190,4 +190,4 @@ def parse_output(raw_pathway_file, standardized_pathway_file): df = reinsert_direction_col_directed(df) df.drop(columns=['Type', 'Oriented', 'Weight'], inplace = True) df.columns = ['Node1', 'Node2', 'Rank', "Direction"] - df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') + df.to_csv(standardized_pathway_file, index=False, sep='\t', header=True) From 0f4510c9824c1ef5f5d3608835d882efac155f5f Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 21 Mar 2024 12:16:02 -0500 Subject: [PATCH 19/26] testing mcf tester with macos-latest --- .github/workflows/test-spras.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml index ec252c8f..d6e13e9c 100644 --- a/.github/workflows/test-spras.yml +++ b/.github/workflows/test-spras.yml @@ -32,7 +32,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest] + os: [macos-latest] steps: - name: Checkout repository uses: actions/checkout@v2 From 36d556b22b3257b590e1623edcb1a5bf600d6c67 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Thu, 21 Mar 2024 12:29:43 -0500 Subject: [PATCH 20/26] revert --- .github/workflows/test-spras.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml index d6e13e9c..ec252c8f 100644 --- a/.github/workflows/test-spras.yml +++ b/.github/workflows/test-spras.yml @@ -32,7 +32,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [macos-latest] + os: [ubuntu-latest] steps: - name: Checkout repository uses: actions/checkout@v2 From f5b880bd7c396a4a099c46563b9a840a4bb8f4b3 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 14 Jun 2024 10:36:14 -0500 Subject: [PATCH 21/26] updated contributing guide --- CONTRIBUTING.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c4452f7a..a943cacb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -154,9 +154,9 @@ Use the `run_container` utility function to run the command in the container `|`. -Convert these to be tab-separated vertex pairs followed by a tab and a `1` at the end of every line, which indicates all edges have the same rank. -See the `add_rank_column` and `raw_pathway_df` function in `src.util.py`. -Make sure header = True with column names: ['Node1', 'Node2', 'Rank', 'Direction'] when the file is created +Convert these to be tab-separated vertex pairs followed by a tab `1` and tab `U` at the end of every line, which indicates all edges have the same rank and are undirected. +See the `add_rank_column` and `raw_pathway_df` function in `src.util.py` and `reinsert_direction_col_undirected` function in `src.interactome.py`. +Make sure header = True with column names: ['Node1', 'Node2', 'Rank', 'Direction'] when the file is created. The output should have the format ` 1 U`. ### Step 4: Make the Local Neighborhood wrapper accessible through SPRAS From 322bfa560a66d293173afb553ad4df2c8d6f30e8 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Fri, 14 Jun 2024 10:42:50 -0500 Subject: [PATCH 22/26] updated new ML test files to include headers --- test/ml/input/test-data-empty/empty.txt | 1 + test/ml/input/test-data-single/single.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/test/ml/input/test-data-empty/empty.txt b/test/ml/input/test-data-empty/empty.txt index e69de29b..63fda2b1 100644 --- a/test/ml/input/test-data-empty/empty.txt +++ b/test/ml/input/test-data-empty/empty.txt @@ -0,0 +1 @@ +Node1 Node2 Rank Direction \ No newline at end of file diff --git a/test/ml/input/test-data-single/single.txt b/test/ml/input/test-data-single/single.txt index 30397283..822ccb97 100644 --- a/test/ml/input/test-data-single/single.txt +++ b/test/ml/input/test-data-single/single.txt @@ -1 +1,2 @@ +Node1 Node2 Rank Direction L M 1 U From 1ffe1d7cd75c20af85074ef23cd054f248c0711b Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 17 Jun 2024 10:02:25 -0500 Subject: [PATCH 23/26] output docs --- doc/output.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 doc/output.md diff --git a/doc/output.md b/doc/output.md new file mode 100644 index 00000000..0b0a6c93 --- /dev/null +++ b/doc/output.md @@ -0,0 +1,15 @@ +## File format +### Universal Output File +Output files include a header row and rows providing attributes for each node. +The header row is `Node1 Node2 Rank Direction` for every output file. +Each row lists the two nodes that are connected with an edge, the rank for that edge, and a directionality column to indicate whether the edge is directed or undirected. +The directionality values are either a 'U' for an undirected edge or a 'D' for a directed edge. + +For example: +``` +Node1 Node2 Rank Direction +A B 1 D +B C 1 D +B D 1 U +D A 1 U +``` From 7db6ea020092c0feadf225bfb4e410b5f94ada2f Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Thu, 4 Jul 2024 08:52:16 -0500 Subject: [PATCH 24/26] Code review and formatting updates --- .github/workflows/test-spras.yml | 1 + .pre-commit-config.yaml | 2 +- doc/output.md | 20 +++++++++++--------- spras/allpairs.py | 4 +--- spras/analysis/ml.py | 6 +++--- spras/analysis/summary.py | 3 +-- spras/domino.py | 13 ++++--------- spras/meo.py | 9 ++++----- spras/mincostflow.py | 7 +++---- spras/omicsintegrator1.py | 11 +++-------- spras/omicsintegrator2.py | 5 ++--- spras/pathlinker.py | 5 ++--- spras/util.py | 18 ++++++++++-------- test/ml/test_ml.py | 4 ++-- test/parse-outputs/test_parse_outputs.py | 4 +--- 15 files changed, 49 insertions(+), 63 deletions(-) diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml index ec252c8f..b801aa4e 100644 --- a/.github/workflows/test-spras.yml +++ b/.github/workflows/test-spras.yml @@ -83,6 +83,7 @@ jobs: docker pull reedcompbio/mincostflow:latest docker pull reedcompbio/allpairs:latest docker pull reedcompbio/domino:latest + docker pull reedcompbio/py4cytoscape:v3 - name: Build Omics Integrator 1 Docker image uses: docker/build-push-action@v1 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 55503ef4..67958453 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ # See https://pre-commit.com/ for documentation default_language_version: # Match this to the version specified in environment.yml - python: python3.8 + python: python3.11 repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 # Use the ref you want to point at diff --git a/doc/output.md b/doc/output.md index 0b0a6c93..431d67c7 100644 --- a/doc/output.md +++ b/doc/output.md @@ -1,15 +1,17 @@ -## File format -### Universal Output File -Output files include a header row and rows providing attributes for each node. -The header row is `Node1 Node2 Rank Direction` for every output file. +## File formats + +### Pathway output format +Output pathway files in the standard SPRAS format include a header row and rows providing attributes for each edge. +The header row is `Node1 Node2 Rank Direction`. Each row lists the two nodes that are connected with an edge, the rank for that edge, and a directionality column to indicate whether the edge is directed or undirected. -The directionality values are either a 'U' for an undirected edge or a 'D' for a directed edge. +The directionality values are either a 'U' for an undirected edge or a 'D' for a directed edge, where the direction is from Node1 to Node2. +Pathways that do not contain ranked edges can output all 1s in the Rank column. For example: ``` Node1 Node2 Rank Direction -A B 1 D -B C 1 D -B D 1 U -D A 1 U +A B 1 D +B C 1 D +B D 2 U +D A 3 U ``` diff --git a/spras/allpairs.py b/spras/allpairs.py index cdce7e40..4c540330 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -1,8 +1,6 @@ import warnings from pathlib import Path -import pandas as pd - from spras.containers import prepare_volume, run_container from spras.interactome import ( convert_directed_to_undirected, @@ -112,7 +110,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param raw_pathway_file: pathway file produced by an algorithm's run function @param standardized_pathway_file: the same pathway written in the universal format """ - df = raw_pathway_df(raw_pathway_file, header=None) + df = raw_pathway_df(raw_pathway_file, sep='\t', header=None) if not df.empty: df = add_rank_column(df) df = reinsert_direction_col_undirected(df) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 5519944f..64f8f003 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -42,12 +42,12 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra lines = f.readlines() if len(lines) > 0: - lines.pop(0) # process header line + lines.pop(0) # skip header line edges = [] for line in lines: parts = line.split('\t') - if len(parts) == 4: # empty lines not allowed but empty files are allowed + if len(parts) == 4: # empty lines not allowed but empty files are allowed node1 = parts[0] node2 = parts[1] direction = str(parts[3]).strip() @@ -57,7 +57,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra elif direction == "D": # node order does matter for directed edges edges.append(DIR_CONST.join([node1, node2])) - elif direction != 'Direction' and direction != 'U' and direction != 'D': + elif direction != 'Direction': raise ValueError(f"direction is {direction}, rather than U or D") elif len(parts) != 0: raise ValueError(f"In file {file}, expected line {line} to have 4 values, but found {len(parts)} values.") diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py index 0a173521..0e4b4b86 100644 --- a/spras/analysis/summary.py +++ b/spras/analysis/summary.py @@ -34,9 +34,8 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) -> # Iterate through each network file path for file_path in sorted(file_paths): - lines = None with open(file_path, 'r') as f: - lines = f.readlines()[1:] # skip the first line + lines = f.readlines()[1:] # skip the header line nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str))) diff --git a/spras/domino.py b/spras/domino.py index 9e6b256e..7c29f439 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -206,10 +206,10 @@ def parse_output(raw_pathway_file, standardized_pathway_file): edges_df['target'] = edges_df['target'].apply(post_domino_id_transform) edges_df = reinsert_direction_col_undirected(edges_df) edges_df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] - edges_df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False) else: - df = pd.DataFrame(columns = ['Node1', 'Node2', 'Rank', 'Direction']) - df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False) + edges_df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction']) + + edges_df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False) def pre_domino_id_transform(node_id): @@ -228,9 +228,4 @@ def post_domino_id_transform(node_id): @param node_id: the node id to transform @return the node id without the prefix, if it was present, otherwise the original node id """ - # Use removeprefix if SPRAS ever requires Python >= 3.9 - # https://docs.python.org/3/library/stdtypes.html#str.removeprefix - if node_id.startswith(ID_PREFIX): - return node_id[ID_PREFIX_LEN:] - else: - return node_id + node_id.removeprefix(ID_PREFIX) diff --git a/spras/meo.py b/spras/meo.py index c854a64f..b614d4c4 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -1,7 +1,5 @@ from pathlib import Path -import pandas as pd - from spras.containers import prepare_volume, run_container from spras.interactome import ( add_directionality_constant, @@ -180,14 +178,15 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param raw_pathway_file: pathway file produced by an algorithm's run function @param standardized_pathway_file: the same pathway written in the universal format """ - df = raw_pathway_df(raw_pathway_file, header=0) + # Columns Source Type Target Oriented Weight + df = raw_pathway_df(raw_pathway_file, sep='\t', header=0) if not df.empty: - # Keep only edges that were assigned an orientation (direction) + # Keep only edges that were assigned an orientation (direction) df = df.loc[df['Oriented']] # TODO what should be the edge rank? # Would need to load the paths output file to rank edges correctly df = add_rank_column(df) df = reinsert_direction_col_directed(df) - df.drop(columns=['Type', 'Oriented', 'Weight'], inplace = True) + df.drop(columns=['Type', 'Oriented', 'Weight'], inplace=True) df.columns = ['Node1', 'Node2', 'Rank', "Direction"] df.to_csv(standardized_pathway_file, index=False, sep='\t', header=True) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 531ae532..b0a47cc7 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -1,7 +1,5 @@ from pathlib import Path -import pandas as pd - from spras.containers import prepare_volume, run_container from spras.interactome import ( convert_undirected_to_directed, @@ -150,10 +148,11 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param standardized_pathway_file: the same pathway written in the universal format """ - df = raw_pathway_df(raw_pathway_file, header=None) + df = raw_pathway_df(raw_pathway_file, sep='\t', header=None) if not df.empty: df = add_rank_column(df) + # TODO update MinCostFlow version to support mixed graphs + # Currently directed edges in the input will be converted to undirected edges in the output df = reinsert_direction_col_undirected(df) df.columns = ['Node1', 'Node2', 'Rank', "Direction"] df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') - diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 4c5e5164..16469924 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -1,8 +1,5 @@ -import os from pathlib import Path -import pandas as pd - from spras.containers import prepare_volume, run_container from spras.interactome import reinsert_direction_col_mixed from spras.prm import PRM @@ -192,14 +189,12 @@ def parse_output(raw_pathway_file, standardized_pathway_file): # I'm assuming from having read the documentation that we will be passing in optimalForest.sif # as raw_pathway_file, in which case the format should be edge1 interactiontype edge2. # if that assumption is wrong we will need to tweak things - - - df = raw_pathway_df(raw_pathway_file, header=None) + df = raw_pathway_df(raw_pathway_file, sep='\t', header=None) if not df.empty: df.columns = ["Edge1", "InteractionType", "Edge2"] df = add_rank_column(df) df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp") - df.drop(columns=['InteractionType'], inplace = True) - df.columns = ['Node1', 'Node2','Rank','Direction'] + df.drop(columns=['InteractionType'], inplace=True) + df.columns = ['Node1', 'Node2', 'Rank', 'Direction'] df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 2d8dee3c..ed0d5b56 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -6,7 +6,7 @@ from spras.dataset import Dataset from spras.interactome import reinsert_direction_col_undirected from spras.prm import PRM -from spras.util import add_rank_column, raw_pathway_df +from spras.util import add_rank_column __all__ = ['OmicsIntegrator2'] @@ -149,7 +149,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file): # Omicsintegrator2 returns a single line file if no network is found num_lines = sum(1 for line in open(raw_pathway_file)) if num_lines < 2: - df = pd.DataFrame(columns = ['Node1', 'Node2', 'Rank', 'Direction']) + df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction']) else: df = pd.read_csv(raw_pathway_file, sep='\t', header=0) df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line @@ -159,4 +159,3 @@ def parse_output(raw_pathway_file, standardized_pathway_file): df.columns = ['Node1', 'Node2', 'Rank', "Direction"] df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t') - diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 97ee64ad..876c22b0 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -1,8 +1,6 @@ import warnings from pathlib import Path -import pandas as pd - from spras.containers import prepare_volume, run_container from spras.interactome import ( convert_undirected_to_directed, @@ -137,7 +135,8 @@ def parse_output(raw_pathway_file, standardized_pathway_file): @param raw_pathway_file: pathway file produced by an algorithm's run function @param standardized_pathway_file: the same pathway written in the universal format """ - df = raw_pathway_df(raw_pathway_file, header=0) + # What about multiple raw_pathway_files? + df = raw_pathway_df(raw_pathway_file, sep='\t', header=0) if not df.empty: df = df.take([0, 1, 2], axis=1) df = reinsert_direction_col_directed(df) diff --git a/spras/util.py b/spras/util.py index efa9bc73..2d2a83d2 100644 --- a/spras/util.py +++ b/spras/util.py @@ -42,6 +42,7 @@ def hash_filename(filename: str, length: Optional[int] = None) -> str: """ return hash_params_sha1_base32({'filename': filename}, length) + def make_required_dirs(path: str): """ Create the directory and parent directories required before an output file can be written to the specified path. @@ -60,17 +61,18 @@ def add_rank_column(df: pd.DataFrame) -> pd.DataFrame: df['Rank'] = 1 return df -def raw_pathway_df(raw_pathway_file: str, header:int= None) -> pd.DataFrame: + +def raw_pathway_df(raw_pathway_file: str, sep: str = '\t', header: int = None) -> pd.DataFrame: """ - Creates DF from contents in raw pathway file, - otherwise returns an empty DF with standard output column names + Creates dataframe from contents in raw pathway file, + otherwise returns an empty dataframe with standard output column names @param raw_pathway_file: path to raw_pathway_file - @param header: what row the header is in raw_pathway_file, otherwise None - + @param sep: separator used when loading the dataframe, default tab character + @param header: what row the header is in raw_pathway_file, default None """ try: - df = pd.read_csv(raw_pathway_file, sep='\t', header=header) - except pd.errors.EmptyDataError: # read an empty file - df = pd.DataFrame(columns = ['Node1', 'Node2','Rank','Direction']) + df = pd.read_csv(raw_pathway_file, sep=sep, header=header) + except pd.errors.EmptyDataError: # read an empty file + df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction']) return df diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index e718cf3f..3010179d 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -43,11 +43,11 @@ def test_summarize_networks_wrong_direction(self): ml.summarize_networks([INPUT_DIR + 'test-data-wrong-direction/wrong-direction.txt']) def test_summarize_networks_empty(self): - with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing + with pytest.raises(ValueError): # raises error if empty dataframe is used for post processing ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt']) def test_single_line(self): - with pytest.raises(ValueError): #raises error if single line in file s.t. single row in dataframe is used for post processing + with pytest.raises(ValueError): # raises error if single line in file s.t. single row in dataframe is used for post processing ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt']) def test_pca(self): diff --git a/test/parse-outputs/test_parse_outputs.py b/test/parse-outputs/test_parse_outputs.py index 0f471e5d..60763d13 100644 --- a/test/parse-outputs/test_parse_outputs.py +++ b/test/parse-outputs/test_parse_outputs.py @@ -1,9 +1,6 @@ import filecmp from pathlib import Path -import pandas as pd -import pytest - from spras import runner INDIR = "test/parse-outputs/input/" @@ -37,5 +34,6 @@ def test_empty_file(self): for algo in algorithms: test_file = INDIR + f"empty-raw-pathway.txt" out_file = OUTDIR + f"{algo}-empty-pathway.txt" + runner.parse_output(algo, test_file, out_file) assert filecmp.cmp(OUTDIR + f"{algo}-empty-pathway.txt", EXPDIR + f"empty-pathway-expected.txt", shallow=False) From 026e7e0be8386c16a5f70474ab5b940e664b81ae Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Thu, 4 Jul 2024 08:56:50 -0500 Subject: [PATCH 25/26] Bump version to 0.2.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 68d10f5c..d19a5988 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "spras" -version = "0.1.0" +version = "0.2.0" description = "Signaling Pathway Reconstruction Analysis Streamliner" authors = [ { name = "Anthony Gitter", email = "gitter@biostat.wisc.edu" }, From d6b019aa17144f460d424af7083157a25010e84b Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Thu, 4 Jul 2024 10:27:39 -0500 Subject: [PATCH 26/26] Fix post_domino_id_transform --- spras/domino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/domino.py b/spras/domino.py index 7c29f439..3170f64d 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -228,4 +228,4 @@ def post_domino_id_transform(node_id): @param node_id: the node id to transform @return the node id without the prefix, if it was present, otherwise the original node id """ - node_id.removeprefix(ID_PREFIX) + return node_id.removeprefix(ID_PREFIX)