def test_pca_robustness(self):
    """Check that ml.pca output does not depend on input column or row order.

    The summarized dataframe is shuffled along each axis several times,
    passed through ml.pca, and the coordinates file it writes is compared
    (after rounding and sorting by the 'algorithm' column) with the stored
    expected coordinates.

    Every shuffle uses a fixed seed so the exact permutations are the same
    on every run and a failing permutation can be reproduced.
    """
    dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt',
                                       INPUT_DIR + 'test-data-s3/s3.txt'])
    expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv')
    expected = expected.round(5)

    # axis=1 permutes the columns, axis=0 permutes the rows; the label only
    # selects the output file names.
    for axis, label in ((1, 'columns'), (0, 'rows')):
        out_prefix = OUT_DIR + 'pca-shuffled-' + label
        coordinates_file = out_prefix + '-coordinates.tsv'
        for seed in range(5):
            # Seeded so that each of the five permutations is reproducible.
            dataframe_shuffled = dataframe.sample(frac=1, axis=axis, random_state=seed)
            ml.pca(dataframe_shuffled, out_prefix + '.png', out_prefix + '-variance.txt', coordinates_file)
            coord = pd.read_table(coordinates_file)
            coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
            # Put the rows in a canonical order before comparing
            # (presumably the output order follows the shuffled input order - confirm against ml.pca).
            coord.sort_values(by='algorithm', ignore_index=True, inplace=True)

            assert coord.equals(expected), \
                f'PCA coordinates changed after shuffling {label} (random_state={seed})'