From 6ae266646a6f1a7c640d489f1b8cff9e383cdc82 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 10 Sep 2024 11:24:27 -0500 Subject: [PATCH 1/4] added pca robustness test by shuffling rows and columns of matrix --- test/ml/test_ml.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 3010179d..7df53c02 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -1,4 +1,5 @@ import filecmp +import random from pathlib import Path import pandas as pd @@ -61,6 +62,34 @@ def test_pca(self): assert coord.equals(expected) + def test_pca_robustness(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) + + for _i in range(5): + dataframe_shuffled = dataframe.sample(frac=1, axis=1) # permute the columns + ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-columns.png', OUT_DIR + 'pca-shuffled-columns-variance.txt', + OUT_DIR + 'pca-shuffled-columns-coordinates.tsv') + coord = pd.read_table(OUT_DIR + 'pca-shuffled-columns-coordinates.tsv') + coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines + coord.sort_values(by='algorithm', ignore_index=True, inplace=True) + expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv') + expected = expected.round(5) + + assert coord.equals(expected) + + for _i in range(5): + dataframe_shuffled = dataframe.sample(frac=1, axis=0) # permute the rows + ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-rows.png', OUT_DIR + 'pca-shuffled-rows-variance.txt', + OUT_DIR + 'pca-shuffled-rows-coordinates.tsv') + coord = pd.read_table(OUT_DIR + 'pca-shuffled-rows-coordinates.tsv') + coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines + coord.sort_values(by='algorithm', ignore_index=True, inplace=True) + expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv') + expected = expected.round(5) + + assert coord.equals(expected) + + def test_hac_horizontal(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) ml.hac_horizontal(dataframe, OUT_DIR + 'hac-horizontal.png', OUT_DIR + 'hac-clusters-horizontal.txt') From e2e4f00af5bfe04292b6932755e0a142aac0a340 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 10 Sep 2024 11:27:03 -0500 Subject: [PATCH 2/4] cleaned up --- test/ml/test_ml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 7df53c02..9a517314 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -65,7 +65,7 @@ def test_pca(self): def test_pca_robustness(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) - for _i in range(5): + for _ in range(5): dataframe_shuffled = dataframe.sample(frac=1, axis=1) # permute the columns ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-columns.png', OUT_DIR + 'pca-shuffled-columns-variance.txt', OUT_DIR + 'pca-shuffled-columns-coordinates.tsv') @@ -77,7 +77,7 @@ def test_pca_robustness(self): assert coord.equals(expected) - for _i in range(5): + for _ in range(5): dataframe_shuffled = dataframe.sample(frac=1, axis=0) # permute the rows ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-rows.png', OUT_DIR + 'pca-shuffled-rows-variance.txt', OUT_DIR + 'pca-shuffled-rows-coordinates.tsv') From 8188a1252830e16308e3a65bd753220661d49a67 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 10 Sep 2024 11:28:38 -0500 Subject: [PATCH 3/4] moved expected --- test/ml/test_ml.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 9a517314..e30ce553 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -64,7 +64,8 @@ def test_pca(self): def test_pca_robustness(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) - + expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv') + expected = expected.round(5) for _ in range(5): dataframe_shuffled = dataframe.sample(frac=1, axis=1) # permute the columns ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-columns.png', OUT_DIR + 'pca-shuffled-columns-variance.txt', @@ -72,8 +73,6 @@ def test_pca_robustness(self): coord = pd.read_table(OUT_DIR + 'pca-shuffled-columns-coordinates.tsv') coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines coord.sort_values(by='algorithm', ignore_index=True, inplace=True) - expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv') - expected = expected.round(5) assert coord.equals(expected) @@ -84,8 +83,6 @@ def test_pca_robustness(self): coord = pd.read_table(OUT_DIR + 'pca-shuffled-rows-coordinates.tsv') coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines coord.sort_values(by='algorithm', ignore_index=True, inplace=True) - expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv') - expected = expected.round(5) assert coord.equals(expected) From 1bf6bb71da98a05607ba25b60141b1b59884ab5c Mon Sep 17 00:00:00 2001 From: Anthony Gitter Date: Tue, 10 Sep 2024 13:26:27 -0500 Subject: [PATCH 4/4] Remove unused import --- test/ml/test_ml.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index e30ce553..2bf90f14 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -1,5 +1,4 @@ import filecmp -import random from pathlib import Path import pandas as pd @@ -77,7 +76,7 @@ def test_pca_robustness(self): assert coord.equals(expected) for _ in range(5): - dataframe_shuffled = dataframe.sample(frac=1, axis=0) # permute the rows + dataframe_shuffled = dataframe.sample(frac=1, axis=0) # permute the rows ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-rows.png', OUT_DIR + 'pca-shuffled-rows-variance.txt', OUT_DIR + 'pca-shuffled-rows-coordinates.tsv') coord = pd.read_table(OUT_DIR + 'pca-shuffled-rows-coordinates.tsv')