From 6ae266646a6f1a7c640d489f1b8cff9e383cdc82 Mon Sep 17 00:00:00 2001
From: ntalluri
Date: Tue, 10 Sep 2024 11:24:27 -0500
Subject: [PATCH] added pca robustness test by shuffling rows and columns of matrix

---
 test/ml/test_ml.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py
index 3010179d..7df53c02 100644
--- a/test/ml/test_ml.py
+++ b/test/ml/test_ml.py
@@ -1,4 +1,5 @@
 import filecmp
+import random
 from pathlib import Path
 
 import pandas as pd
@@ -61,6 +62,28 @@ def test_pca(self):
 
         assert coord.equals(expected)
 
+    def test_pca_robustness(self):
+        """Check that the PCA coordinates are unchanged when the columns or the rows of the input matrix are permuted."""
+        dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
+        # the expected coordinates are the same for every permutation, so load and round them once
+        expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv')
+        expected = expected.round(5)
+
+        # axis=1 permutes the columns of the matrix, axis=0 permutes the rows
+        for axis, label in ((1, 'columns'), (0, 'rows')):
+            out_prefix = OUT_DIR + 'pca-shuffled-' + label
+            for seed in range(5):
+                # seed each shuffle so that a failing permutation can be reproduced
+                dataframe_shuffled = dataframe.sample(frac=1, axis=axis, random_state=seed)
+                ml.pca(dataframe_shuffled, out_prefix + '.png', out_prefix + '-variance.txt',
+                       out_prefix + '-coordinates.tsv')
+                coord = pd.read_table(out_prefix + '-coordinates.tsv')
+                coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
+                coord.sort_values(by='algorithm', ignore_index=True, inplace=True)
+
+                assert coord.equals(expected)
+
+
     def test_hac_horizontal(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
         ml.hac_horizontal(dataframe, OUT_DIR + 'hac-horizontal.png', OUT_DIR + 'hac-clusters-horizontal.txt')