Adding Directionality #120

Merged

merged 50 commits into master from direction on Dec 3, 2023

Changes from 5 commits

50 commits
af6a590
documented research per algorithm as docstrings per class for the alg…
ntalluri Aug 23, 2023
3f173b5
added directionality for generate inputs
ntalluri Aug 29, 2023
a214cc8
Merge branch 'master' of github.com:ntalluri/spras into direction
ntalluri Aug 30, 2023
1e7fef9
added in parse_output directionality
ntalluri Aug 30, 2023
4fdaadb
removed directed from config file, testing with analysis all false
ntalluri Aug 30, 2023
cae057a
made updates to code and attempted to add testing for interactome
ntalluri Sep 4, 2023
319a5d3
precommit formatting
ntalluri Sep 5, 2023
bb55ee0
cleaned up code and finished interactome test
ntalluri Sep 5, 2023
7aed4ee
updated util to deal with the idea if someone is using an old config …
ntalluri Sep 5, 2023
0e15534
current ml repairs
ntalluri Sep 5, 2023
75be8b8
ml post processing pre-commit and config file
ntalluri Sep 6, 2023
550614a
fixed testing
ntalluri Sep 12, 2023
760b1b7
Merge branch 'master' into direction
ntalluri Sep 18, 2023
9e29ceb
updated summary.py/associated files and tests. updated interactome.py
ntalluri Sep 18, 2023
0b47bd9
Merge branch 'direction' of github.com:ntalluri/spras into direction
ntalluri Sep 18, 2023
713279a
pre-commit
ntalluri Sep 18, 2023
1198c9a
added generate inputs test, cleaned up code
ntalluri Sep 19, 2023
39b4748
Resolve merge conflicts
agitter Sep 22, 2023
af1b849
Update EGFR network with edge directions
agitter Sep 22, 2023
a34f832
added back graphspace to work for directed and undirected graphs only
ntalluri Sep 26, 2023
9b551d6
precommit
ntalluri Sep 26, 2023
b314751
automate test_prepare_inputs
ntalluri Sep 29, 2023
32be496
renamed the tests for creating the inputs
ntalluri Oct 4, 2023
57599f7
fix break in test
ntalluri Oct 4, 2023
e991e0a
added parse_output tests and still fixing generate inputs
ntalluri Oct 4, 2023
dd6c899
precommit
ntalluri Oct 4, 2023
ae276f5
added more information to step 5 of contributing guide
ntalluri Oct 4, 2023
df7da32
add cytoscape into workflow
ntalluri Oct 4, 2023
e5be5a0
cleaning up generate inputs/parse outputs test suites
ntalluri Oct 17, 2023
09c2913
clean up gen inputs and parse outputs
ntalluri Oct 17, 2023
8dd0990
Merge with master
agitter Oct 18, 2023
d9b0f0b
Fix ruff errors on GitHub actions
agitter Oct 18, 2023
6f631ef
made changes based on review
ntalluri Oct 26, 2023
f651376
made more changes based on review
ntalluri Oct 26, 2023
b764129
fixed error
ntalluri Oct 27, 2023
8eb9ebb
some of the comments
ntalluri Nov 27, 2023
5c0939f
more comments
ntalluri Nov 27, 2023
884fe4f
precommit
ntalluri Nov 27, 2023
19a4ad4
more comments
ntalluri Nov 27, 2023
99e6e6f
more comments resolved
ntalluri Nov 28, 2023
e8735ea
resolving parse output comments
ntalluri Nov 29, 2023
96c0482
precommit
ntalluri Nov 29, 2023
83262c7
add check to dataset.py
ntalluri Dec 1, 2023
7bc9e3e
Rename .csv to .txt in test directory
agitter Dec 3, 2023
d918c00
Add tests for invalid 4th edge column
agitter Dec 3, 2023
cd625d4
Remove self-edges from EGFR data
agitter Dec 3, 2023
ee58069
Update Cytoscape wrapper for directed edges
agitter Dec 3, 2023
da655b3
Remove directed from EGFR config and run more algs
agitter Dec 3, 2023
b4d6e1c
Systematic proofreading and formatting
agitter Dec 3, 2023
cccf5ce
Bump Cytoscape image version in workflow
agitter Dec 3, 2023
13 changes: 3 additions & 10 deletions config/config.yaml
@@ -29,14 +29,12 @@
   - name: "pathlinker"
     params:
       include: true
-      directed: true
       run1:
         k: range(100,201,100)

   - name: "omicsintegrator1"
     params:
       include: true
-      directed: false
       run1:
         r: [5]
         b: [5, 6]
@@ -47,7 +45,6 @@
   - name: "omicsintegrator2"
     params:
       include: true
-      directed: false
       run1:
         b: [4]
         g: [0]
@@ -58,7 +55,6 @@
   - name: "meo"
     params:
       include: true
-      directed: true
       run1:
         max_path_length: [3]
         local_search: ["Yes"]
@@ -67,20 +63,17 @@
   - name: "mincostflow"
     params:
       include: true
-      directed: false
       run1:
         flow: [1] # The flow must be an int
         capacity: [1]

   - name: "allpairs"
     params:
       include: true
-      directed: false

   - name: "domino"
     params:
       include: true
-      directed: false
       run1:
         slice_threshold: [0.3]
         module_threshold: [0.05]
@@ -125,13 +118,13 @@
 analysis:
   # Create one summary per pathway file and a single summary table for all pathways for each dataset
   summary:
-    include: true
+    include: false
   # Create output files for each pathway that can be visualized with GraphSpace
   graphspace:
-    include: true
+    include: false
   # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
   ml:
-    include: true
+    include: false
     # specify how many principal components to calculate
     components: 2
     # boolean to show the labels on the pca graph
18 changes: 9 additions & 9 deletions input/alternative-network.txt
@@ -1,9 +1,9 @@
-A B 0.98
-B C 0.77
-A D 0.12
-C D 0.89
-C E 0.59
-C F 0.50
-F G 0.76
-G H 0.92
-G I 0.66
+A B 0.98 U
+B C 0.77 U
+A D 0.12 U
+C D 0.89 U
+C E 0.59 U
+C F 0.50 U
+F G 0.76 U
+G H 0.92 U
+G I 0.66 U
4 changes: 2 additions & 2 deletions input/network.txt
@@ -1,2 +1,2 @@
-A B 0.98
-B C 0.77
+A B 0.98 U
+B C 0.77 U
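As the diffs above show, the interactome files gain a fourth column containing 'U' (undirected) or 'D' (directed) per edge, while three-column files are still accepted and treated as fully undirected. A minimal sketch of reading either format, mirroring the Dataset loading logic below ("my-network.txt" is a hypothetical path):

import pandas as pd

edges = pd.read_table("my-network.txt", sep="\t", header=None)
num_cols = edges.shape[1]
if num_cols == 3:
    # Legacy three-column file: assume every edge is undirected
    edges.columns = ["Interactor1", "Interactor2", "Weight"]
    edges["Direction"] = "U"
elif num_cols == 4:
    edges.columns = ["Interactor1", "Interactor2", "Weight", "Direction"]
else:
    raise ValueError(f"edge files must have three or four columns but found {num_cols}")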
8 changes: 7 additions & 1 deletion src/allpairs.py
@@ -3,6 +3,7 @@

 import pandas as pd

+from src.dataset import convert_directed_to_undirected, readd_direction_col_undirected
 from src.prm import PRM
 from src.util import prepare_volume, run_container

@@ -42,8 +43,12 @@ def generate_inputs(data, filename_map):

     input_df.to_csv(filename_map["nodetypes"], sep="\t", index=False, columns=["#Node", "Node type"])

     # Create network file
+    edges_df = data.get_interactome()
+
+    # Format network file
+    edges_df = convert_directed_to_undirected(edges_df)
     # This is pretty memory intensive. We might want to keep the interactome centralized.
-    data.get_interactome().to_csv(filename_map["network"], sep="\t", index=False,
+    edges_df.to_csv(filename_map["network"], sep="\t", index=False,
                     columns=["Interactor1", "Interactor2", "Weight"],
                     header=["#Interactor1", "Interactor2", "Weight"])

@@ -100,4 +105,5 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
     """
     df = pd.read_csv(raw_pathway_file, sep='\t', header=None)
     df['Rank'] = 1  # add a rank column of 1s since the edges are not ranked.
+    df = readd_direction_col_undirected(df, 2)
     df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
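To summarize the pattern above: All Pairs only handles undirected graphs, so its wrapper collapses any directed edges before writing the network file and restores an all-'U' Direction column when standardizing the output. A small sketch of that round trip, using made-up edges:

import pandas as pd

from src.dataset import convert_directed_to_undirected, readd_direction_col_undirected

edges = pd.DataFrame(
    [["A", "B", 0.98, "D"], ["B", "C", 0.77, "U"]],
    columns=["Interactor1", "Interactor2", "Weight", "Direction"],
)
# Before running the algorithm: every edge becomes undirected
edges = convert_directed_to_undirected(edges)

# After parsing the raw output: insert a Direction column of 'U' at position 2
pathway = pd.DataFrame([["A", "B", 1]])  # node pair plus a constant rank
pathway = readd_direction_col_undirected(pathway, 2)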
215 changes: 200 additions & 15 deletions src/dataset.py
@@ -14,7 +14,7 @@
 class Dataset:

     NODE_ID = "NODEID"
-    warning_threshold = 0.05 #Threshold for scarcity of columns to warn user
+    warning_threshold = 0.05  # Threshold for scarcity of columns to warn user

     def __init__(self, dataset_dict):
         self.label = None
@@ -63,25 +63,57 @@ def load_files_from_dict(self, dataset_dict):

         self.label = dataset_dict["label"]

-        #Get file paths from config
+        # Get file paths from config
         # TODO support multiple edge files
         interactome_loc = dataset_dict["edge_files"][0]
         node_data_files = dataset_dict["node_files"]
-        #edge_data_files = [""] # Currently None
+        # edge_data_files = [""] # Currently None
         data_loc = dataset_dict["data_dir"]

-        #Load everything as pandas tables
-        self.interactome = pd.read_table(os.path.join(data_loc,interactome_loc), names = ["Interactor1","Interactor2","Weight"])
+        # Load everything as pandas tables
+        print("about to create self.interactome")
+        print(data_loc)
+        print(interactome_loc)
+
+        with open(os.path.join(data_loc, interactome_loc), "r") as f:
+            for _i in range(9):  # print the first lines of the file
+                print(f.readline())
+
+        self.interactome = pd.read_table(
+            os.path.join(data_loc, interactome_loc), sep="\t", header=None
+        )
+        print(self.interactome)
+        num_cols = self.interactome.shape[1]
+        print(num_cols)
+        if num_cols == 3:
+            self.interactome.columns = ["Interactor1", "Interactor2", "Weight"]
+            self.interactome["Direction"] = "U"
+        elif num_cols == 4:
+            self.interactome.columns = [
+                "Interactor1",
+                "Interactor2",
+                "Weight",
+                "Direction",
+            ]
+        else:
+            raise ValueError(
+                f"edge_files must have three or four columns but found {num_cols}"
+            )

         node_set = set(self.interactome.Interactor1.unique())
         node_set = node_set.union(set(self.interactome.Interactor2.unique()))

-        #Load generic node tables
+        # Load generic node tables
         self.node_table = pd.DataFrame(node_set, columns=[self.NODE_ID])
         for node_file in node_data_files:
-            single_node_table = pd.read_table(os.path.join(data_loc,node_file))
-            #If we have only 1 column, assume this is an indicator variable
-            if len(single_node_table.columns)==1:
-                single_node_table = pd.read_table(os.path.join(data_loc,node_file),header=None)
+            single_node_table = pd.read_table(os.path.join(data_loc, node_file))
+            # If we have only 1 column, assume this is an indicator variable
+            if len(single_node_table.columns) == 1:
+                single_node_table = pd.read_table(
+                    os.path.join(data_loc, node_file), header=None
+                )
             single_node_table.columns = [self.NODE_ID]
             new_col_name = node_file.split(".")[0]
             single_node_table[new_col_name] = True
@@ -91,7 +123,9 @@ def load_files_from_dict(self, dataset_dict):
             # will be ignored
             # TODO may want to warn about duplicates before removing them, for instance, if a user loads two files that
             # both have prizes
-            self.node_table = self.node_table.merge(single_node_table, how="left", on=self.NODE_ID, suffixes=(None, "_DROP")).filter(regex="^(?!.*DROP)")
+            self.node_table = self.node_table.merge(
+                single_node_table, how="left", on=self.NODE_ID, suffixes=(None, "_DROP")
+            ).filter(regex="^(?!.*DROP)")
         # Ensure that the NODEID column always appears first, which is required for some downstream analyses
         self.node_table.insert(0, "NODEID", self.node_table.pop("NODEID"))
         self.other_files = dataset_dict["other_files"]
@@ -103,11 +137,18 @@ def request_node_columns(self, col_names):
         """
         col_names.append(self.NODE_ID)
         filtered_table = self.node_table[col_names]
-        filtered_table = filtered_table.dropna(axis=0, how='all',subset=filtered_table.columns.difference([self.NODE_ID]))
-        percent_hit = (float(len(filtered_table))/len(self.node_table))*100
-        if percent_hit <= self.warning_threshold*100:
+        filtered_table = filtered_table.dropna(
+            axis=0, how="all", subset=filtered_table.columns.difference([self.NODE_ID])
+        )
+        percent_hit = (float(len(filtered_table)) / len(self.node_table)) * 100
+        if percent_hit <= self.warning_threshold * 100:
             # Only use stacklevel 1 because this is due to the data not the code context
-            warnings.warn("Only %0.2f of data had one or more of the following columns filled:"%(percent_hit) + str(col_names), stacklevel=1)
+            warnings.warn(
+                "Only %0.2f of data had one or more of the following columns filled:"
+                % (percent_hit)
+                + str(col_names),
+                stacklevel=1,
+            )
         return filtered_table

     def contains_node_columns(self, col_names):
@@ -131,3 +172,147 @@ def get_other_files(self):

     def get_interactome(self):
         return self.interactome.copy()

+def convert_undirected_to_directed(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Turns a graph into a fully directed graph:
+    - turns every undirected edge into a bi-directed edge (a pair of directed edges)
+    - with bi-directed edges, we are not losing too much information because the relationship of the undirected edge is still preserved
+
+    *A user must keep the Direction column when using this function
+
+    @param df: input network df of edges, weights, and directionality
+    @return a dataframe with no undirected edges in the Direction column
+    """
+
+    # TODO: add a check to make sure there is a Direction column in df
+
+    for index, row in df.iterrows():
+        if row["Direction"] == "U":
+            df.at[index, "Direction"] = "D"
+
+            new_directed_row = row.copy(deep=True)
+            new_directed_row["Interactor1"], new_directed_row["Interactor2"] = (
+                row["Interactor2"],
+                row["Interactor1"],
+            )
+            print("new directed row\n", new_directed_row)
+            new_directed_row["Direction"] = "D"
+            df.loc[len(df)] = new_directed_row
+
+    return df
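A quick usage sketch (the one-edge network is made up): a single undirected edge becomes a pair of directed edges, one in each direction.

import pandas as pd

from src.dataset import convert_undirected_to_directed

df = pd.DataFrame(
    [["A", "B", 0.98, "U"]],
    columns=["Interactor1", "Interactor2", "Weight", "Direction"],
)
df = convert_undirected_to_directed(df)
# df now holds two directed rows:
#   A  B  0.98  D
#   B  A  0.98  D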


+def convert_directed_to_undirected(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Turns a graph into a fully undirected graph:
+    - turns the directed edges directly into undirected edges
+    - we will lose any sense of directionality, and the graph won't be wholly accurate, but the basic relationship between the two connected nodes remains intact
+
+    @param df: input network df of edges, weights, and directionality
+    @return a dataframe with no directed edges in the Direction column
+    """
+
+    for index, row in df.iterrows():
+        if row["Direction"] == "D":
+            df.at[index, "Direction"] = "U"
+
+    return df


+def add_seperator(df: pd.DataFrame, col_loc: int, col_name: str, sep: str) -> pd.DataFrame:
+    """
+    Adds a constant separator column to the input dataframe
+
+    @param df: input network df of edges, weights, and directionality
+    @param col_loc: the position in the dataframe to insert the new column
+    @param col_name: the name of the new column
+    @param sep: the separator value to place in every row
+    @return a df with the new separator column added to every row
+    """
+
+    df.insert(col_loc, col_name, sep)
+    return df


+def add_directionality_seperators(df: pd.DataFrame, col_loc: int, col_name: str, dir_sep: str, undir_sep: str) -> pd.DataFrame:
+    """
+    Adds a directionality separator column for mixed graphs whose format is not the universal input format
+
+    *A user must keep the Direction column when using this function
+
+    @param df: input network df of edges, weights, and directionality
+    @param col_loc: the position in the dataframe to insert the new column
+    @param col_name: the name of the new column
+    @param dir_sep: the directed edge separator
+    @param undir_sep: the undirected edge separator
+    @return a df converted to show directionality with the given separators
+    """
+
+    # TODO: add a check to make sure there is a Direction column in df
+
+    df.insert(col_loc, col_name, dir_sep)
+
+    for index, row in df.iterrows():
+        if row["Direction"] == "U":
+            df.at[index, col_name] = undir_sep
+        elif row["Direction"] == "D":
+            continue
+        else:
+            raise ValueError(
+                f'direction must be a \'U\' or \'D\', but found {row["Direction"]}'
+            )
+
+    return df
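A sketch of how a wrapper might use this, with hypothetical separators ('->' for directed, '--' for undirected; real algorithm formats may use different tokens). Inserting at column position 1 places the separator between the two interactors:

import pandas as pd

from src.dataset import add_directionality_seperators

df = pd.DataFrame(
    [["A", "B", 0.98, "D"], ["B", "C", 0.77, "U"]],
    columns=["Interactor1", "Interactor2", "Weight", "Direction"],
)
df = add_directionality_seperators(df, 1, "EdgeType", "->", "--")
# The A-B row gets '->' in the new EdgeType column; the B-C row gets '--'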

+def readd_direction_col_mixed(df: pd.DataFrame, direction_col_loc: int, existing_direction_column: str, dir_sep: str, undir_sep: str) -> pd.DataFrame:
+    """
+    Re-adds a 'Direction' column containing 'U' or 'D' based on the directed/undirected separators in the existing direction column
+
+    *A user must keep the existing direction column when using this function
+
+    @param df: input network df that contains directionality
+    @param direction_col_loc: the position in the dataframe to put back the 'Direction' column
+    @param existing_direction_column: the name of the existing directionality column
+    @param dir_sep: the directed edge separator
+    @param undir_sep: the undirected edge separator
+    @return a df with the Direction column added back
+    """
+
+    df.insert(direction_col_loc, "Direction", "D")
+
+    for index, row in df.iterrows():
+        if row[existing_direction_column] == undir_sep:
+            df.at[index, "Direction"] = "U"
+        elif row[existing_direction_column] == dir_sep:
+            df.at[index, "Direction"] = "D"
+        else:
+            raise ValueError(
+                f'direction must be a \'{dir_sep}\' or \'{undir_sep}\', but found {row[existing_direction_column]}'
+            )
+
+    return df
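And the inverse direction, continuing the hypothetical '->'/'--' separators from the sketch above: recover 'U'/'D' from an algorithm's output column when standardizing a mixed pathway.

import pandas as pd

from src.dataset import readd_direction_col_mixed

out = pd.DataFrame(
    [["A", "->", "B", 0.98], ["B", "--", "C", 0.77]],
    columns=["Interactor1", "EdgeType", "Interactor2", "Weight"],
)
out = readd_direction_col_mixed(out, 4, "EdgeType", "->", "--")
# out gains a Direction column: 'D' for the A->B row, 'U' for the B--C row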

+def readd_direction_col_undirected(df: pd.DataFrame, direction_col_loc: int) -> pd.DataFrame:
+    """
+    Re-adds a 'Direction' column filled with 'U'
+
+    @param df: input network df that contains directionality
+    @param direction_col_loc: the position in the dataframe to put back the 'Direction' column
+    @return a df with the Direction column added back
+    """
+    df.insert(direction_col_loc, "Direction", "U")
+    return df

+def readd_direction_col_directed(df: pd.DataFrame, direction_col_loc: int) -> pd.DataFrame:
+    """
+    Re-adds a 'Direction' column filled with 'D'
+
+    @param df: input network df that contains directionality
+    @param direction_col_loc: the position in the dataframe to put back the 'Direction' column
+    @return a df with the Direction column added back
+    """
+    df.insert(direction_col_loc, "Direction", "D")
+    return df