Merge pull request #182 from ntalluri/issue-133

Omics Integrator 2 Testing Error
Reed-CompBio · Sep 22, 2024 · 7b07916 · 7b07916
2 parents cd1efd7 + 2e2c5c1
commit 7b07916
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 5 deletions.
diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py
@@ -148,14 +148,20 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         """
         # Omicsintegrator2 returns a single line file if no network is found
         num_lines = sum(1 for line in open(raw_pathway_file))
+        # Omicsintegrator2 can produce corrupted output; this is the sorted list of expected column names
+        sorted_correct_column_names = ['cost', 'in_solution', 'protein1', 'protein2'] # the order of edge attributes in the NetworkX graph is not guaranteed.
+
         if num_lines < 2:
             df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
         else:
             df = pd.read_csv(raw_pathway_file, sep='\t', header=0)
-            df = df[df['in_solution'] == True]  # Check whether this column can be empty before revising this line
-            df = df.take([0, 1], axis=1)
-            df = add_rank_column(df)
-            df = reinsert_direction_col_undirected(df)
-            df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
+            if sorted(df.columns) == sorted_correct_column_names: # if column header names are all correct
+                df = df[df['in_solution'] == True]  # the 'in_solution' column exists when the forest is not empty.
+                df = df.take([0, 1], axis=1) # the first two columns in the df will be 'protein1' and 'protein2', followed by the edge attributes.
+                df = add_rank_column(df)
+                df = reinsert_direction_col_undirected(df)
+                df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
+            else: # corrupted data
+                df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
 
         df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')
diff --git a/...outputs/input/omicsintegrator-edge-cases/omicsintegrator2-miss-insolution-raw-pathway.txt b/...outputs/input/omicsintegrator-edge-cases/omicsintegrator2-miss-insolution-raw-pathway.txt
@@ -0,0 +1,3 @@
+protein1	protein2	cost
+B	A	0.52
+B	C	0.73
diff --git a/...rse-outputs/input/omicsintegrator-edge-cases/omicsintegrator2-wrong-order-raw-pathway.txt b/...rse-outputs/input/omicsintegrator-edge-cases/omicsintegrator2-wrong-order-raw-pathway.txt
@@ -0,0 +1,3 @@
+protein1	protein2	in_solution	cost
+B	A	True	0.52
+B	C	True	0.73
diff --git a/test/parse-outputs/test_parse_outputs.py b/test/parse-outputs/test_parse_outputs.py
@@ -6,6 +6,7 @@
 INDIR = "test/parse-outputs/input/"
 OUTDIR = "test/parse-outputs/output/"
 EXPDIR = "test/parse-outputs/expected/"
+OI2_EDGE_CASES_INDIR = 'test/parse-outputs/input/omicsintegrator-edge-cases/'
 
 # DOMINO input is the concatenated module_0.html and module_1.html file from
 # the DOMINO output of the network dip.sif and the nodes tnfa_active_genes_file.txt
@@ -37,3 +38,17 @@ def test_empty_file(self):
 
             runner.parse_output(algo, test_file, out_file)
             assert filecmp.cmp(OUTDIR + f"{algo}-empty-pathway.txt", EXPDIR + f"empty-pathway-expected.txt", shallow=False)
+
+    def test_oi2_miss_insolution(self):
+        test_file = OI2_EDGE_CASES_INDIR + f"omicsintegrator2-miss-insolution-raw-pathway.txt"
+        out_file = OUTDIR + f"omicsintegrator2-miss-insolution-pathway.txt"
+
+        runner.parse_output('omicsintegrator2', test_file, out_file)
+        assert filecmp.cmp(out_file, EXPDIR + f"empty-pathway-expected.txt", shallow=False)
+
+    def test_oi2_wrong_order(self):
+        test_file = OI2_EDGE_CASES_INDIR + f"omicsintegrator2-wrong-order-raw-pathway.txt"
+        out_file = OUTDIR + f"omicsintegrator2-wrong-order-pathway.txt"
+
+        runner.parse_output('omicsintegrator2', test_file, out_file)
+        assert filecmp.cmp(out_file, EXPDIR + f"omicsintegrator2-pathway-expected.txt", shallow=False)