Reed-CompBio · ntalluri · Oct 25, 2024 · Oct 25, 2024 · ntalluri · Nov 18, 2024
diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
@@ -78,6 +78,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
                 str(tup[0]): 1,
             }, index=tup[1]
         )
+        dataframe = dataframe[~dataframe.index.duplicated(keep='first')]
-        dataframe = dataframe[~dataframe.index.duplicated(keep='first')]
+        # index.duplicated(keep='first') flags every repeat of an index label except its first occurrence;
+        # negating that mask with ~ keeps the first occurrence of each duplicate and all non-duplicates
+        dataframe = dataframe[~dataframe.index.duplicated(keep='first')]
-        dataframe = dataframe[~dataframe.index.duplicated(keep='first')]
+        # index.duplicated(keep='first') flags every repeat of an index label except its first occurrence;
+        # negating that mask with ~ keeps the first occurrence of each duplicate and all non-duplicates
+        dataframe = dataframe[~dataframe.index.duplicated(keep='first')]
         edge_dataframes.append(dataframe)
 
     # concatenating all the algorithm-specific dataframes together

diff --git a/test/ml/expected/expected-dataframe.csv b/test/ml/expected/expected-dataframe.csv
@@ -1,16 +1,16 @@
-,test-data-s1,test-data-s2,test-data-s3,test-data-longName,test-data-longName2,test-data-empty,test-data-spaces,test-data-mixed-direction
-A---B,1,1,0,0,0,0,0,0
-C---D,1,1,0,0,0,0,0,1
-E---F,1,1,0,0,0,0,0,1
-L---M,0,1,1,0,0,0,1,0
-M---N,0,0,1,0,0,0,0,0
-O---P,0,0,1,0,0,0,1,0
-P---Q,0,0,1,0,0,0,0,0
-node1---node2,0,0,0,1,0,0,0,0
-node1---node3,0,0,0,1,1,0,0,0
-node4---node5,0,0,0,1,1,0,0,0
-LONGERNAMES---TEST,0,0,0,1,1,0,0,0
-node2---node3,0,0,0,0,1,0,0,0
-nodes with---spaces in name,0,0,0,0,0,0,1,0
-A-->B,0,0,0,0,0,0,0,1
-B-->A,0,0,0,0,0,0,0,1
+,test-data-s1,test-data-s2,test-data-s3,test-data-longName,test-data-longName2,test-data-empty,test-data-spaces,test-data-mixed-direction,test-repeat-edges
+A---B,1,1,0,0,0,0,0,0,1
+C---D,1,1,0,0,0,0,0,1,1
+E---F,1,1,0,0,0,0,0,1,1
+L---M,0,1,1,0,0,0,1,0,0
+M---N,0,0,1,0,0,0,0,0,0
+O---P,0,0,1,0,0,0,1,0,0
+P---Q,0,0,1,0,0,0,0,0,0
+node1---node2,0,0,0,1,0,0,0,0,0
+node1---node3,0,0,0,1,1,0,0,0,0
+node4---node5,0,0,0,1,1,0,0,0,0
+LONGERNAMES---TEST,0,0,0,1,1,0,0,0,0
+node2---node3,0,0,0,0,1,0,0,0,0
+nodes with---spaces in name,0,0,0,0,0,0,1,0,0
+A-->B,0,0,0,0,0,0,0,1,0
+B-->A,0,0,0,0,0,0,0,1,0
diff --git a/test/ml/input/test-repeat-edges/repeat.txt b/test/ml/input/test-repeat-edges/repeat.txt
@@ -0,0 +1,5 @@
+Node1	Node2	Rank	Direction
+A	B	1	U
+C	D	1	U
+E	F	1	U
+A	B	1	U
diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py
@@ -22,7 +22,7 @@ def setup_class(cls):
     def test_summarize_networks(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt',
                                            INPUT_DIR + 'test-data-longName/longName.txt', INPUT_DIR + 'test-data-longName2/longName2.txt',
-                                           INPUT_DIR + 'test-data-empty/empty.txt', INPUT_DIR + 'test-data-spaces/spaces.txt', INPUT_DIR + 'test-data-mixed-direction/mixed-direction.txt'])
+                                           INPUT_DIR + 'test-data-empty/empty.txt', INPUT_DIR + 'test-data-spaces/spaces.txt', INPUT_DIR + 'test-data-mixed-direction/mixed-direction.txt',  INPUT_DIR + 'test-repeat-edges/repeat.txt'])
         dataframe.to_csv(OUT_DIR + 'dataframe.csv')
         assert filecmp.cmp(OUT_DIR + 'dataframe.csv', EXPECT_DIR + 'expected-dataframe.csv', shallow=False)