From bed2095af18e7a5801da48f9db18a8fbb1966fda Mon Sep 17 00:00:00 2001
From: SkBlaz
Date: Thu, 22 Aug 2024 09:42:06 +0200
Subject: [PATCH] Coverage alignment heuristic (#76)

* cov alignment heuristic

* cov heu test

* more tests

* tests

* Setup.py

* tests

* overflow

* proper types?

* int32

* moved test
---
Reviewer notes (commentary only; not part of the recorded commit, and the
hunks below are unchanged apart from restored line breaks and indentation):

  * max_pair_coverage() maps every (array1[i], array2[i]) pair to one of
    max_size = 10**6 buckets via (el1 * 1471343 - el2) % max_size and returns
    the largest bucket count divided by len(array1). Distinct pairs that land
    in the same bucket are counted together, so the value can overstate the
    true share of the most frequent pair.
  * The divisor is len(array1); an empty array1 therefore ends in a division
    by zero. array2 is only indexed up to len(array1), so a shorter array2
    raises IndexError and a longer one is silently truncated.
  * NOTE(review): the annotations say int32, and 1471343 * el1 leaves the
    int32 range once el1 exceeds roughly 1459 -- confirm the dtype callers
    actually pass (the new tests build arrays with NumPy's default integer
    dtype).
  * The counting loop runs once per element in Python; the same bucket counts
    could be produced in one vectorised pass (e.g. numpy.bincount over the
    hashed identifiers).
  * ranking_cov_alignment.py calls np.random.seed(123) at import time, and
    importance_estimator.py now imports that module, so importing the
    estimator reseeds NumPy's global RNG.
  * conduct_self_test() interpolates `heuristic` into a shell=True command
    string without quoting; harmless for the two literals used here, but
    worth quoting if the value ever comes from user input.
  * tests/cov_heu_test.py appends './outrank' to sys.path after the
    `from outrank...` import has already executed, so that line cannot
    influence the import.

 outrank/__main__.py                           |  2 +-
 .../feature_ranking/ranking_cov_alignment.py  | 28 ++++++++++
 outrank/algorithms/importance_estimator.py    |  3 +
 outrank/task_selftest.py                      | 10 ++--
 setup.py                                      |  2 +-
 tests/cov_heu_test.py                         | 56 +++++++++++++++++++
 6 files changed, 95 insertions(+), 6 deletions(-)
 create mode 100644 outrank/algorithms/feature_ranking/ranking_cov_alignment.py
 create mode 100644 tests/cov_heu_test.py

diff --git a/outrank/__main__.py b/outrank/__main__.py
index 1ccc1d2..f261dc5 100644
--- a/outrank/__main__.py
+++ b/outrank/__main__.py
@@ -243,7 +243,7 @@ def main():
     args = parser.parse_args()
 
     if args.task == 'selftest':
-        conduct_self_test()
+        conduct_self_test('MI-numba-randomized')
         exit()
 
     if args.data_path is None and args.task != 'data_generator':
diff --git a/outrank/algorithms/feature_ranking/ranking_cov_alignment.py b/outrank/algorithms/feature_ranking/ranking_cov_alignment.py
new file mode 100644
index 0000000..094a34e
--- /dev/null
+++ b/outrank/algorithms/feature_ranking/ranking_cov_alignment.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+import numpy as np
+import numpy.typing as npt
+
+np.random.seed(123)
+max_size = 10**6
+
+
+def max_pair_coverage(array1: npt.NDArray[np.int32], array2: npt.NDArray[np.int32]) -> float:
+    def hash_pair(el1: np.int32, el2: np.int32):
+        return (el1 * 1471343 - el2) % max_size
+
+    counts = np.zeros(max_size, dtype=np.int32)
+    tot_len = len(array1)
+    for i in range(tot_len):
+        identifier = hash_pair(array1[i], array2[i])
+        counts[identifier] += 1
+
+    return np.max(counts) / tot_len
+
+
+if __name__ == '__main__':
+
+    array1 = np.array([1,1,2,3,1,1,1,5] * 100000)
+    array2 = np.array([0,0,5,5,3,0,0,0] * 100000)
+    coverage = max_pair_coverage(array1, array2)
+    assert coverage == 0.5
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index 56953c5..68fe5ab 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -18,6 +18,7 @@
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.svm import SVC
 
+from outrank.algorithms.feature_ranking import ranking_cov_alignment
 from outrank.core_utils import is_prior_heuristic
 
 logger = logging.getLogger('syn-logger')
@@ -129,6 +130,8 @@ def get_importances_estimate_pairwise(combination, reference_model_features, arg
         estimate_feature_importance = sklearn_surrogate(
             vector_first, vector_second, X, args.heuristic,
         )
+    elif 'max-value-coverage' in args.heuristic:
+        estimate_feature_importance = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
 
     elif 'MI-numba' in args.heuristic:
         estimate_feature_importance = numba_mi(
diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py
index b28536c..de95634 100644
--- a/outrank/task_selftest.py
+++ b/outrank/task_selftest.py
@@ -16,13 +16,13 @@
 logger.setLevel(logging.DEBUG)
 
 
-def conduct_self_test():
+def conduct_self_test(heuristic='MI-numba-randomized'):
     # Simulate full flow, ranking only
     subprocess.run(
         'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
     )
     subprocess.run(
-        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
+        f'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --heuristic {heuristic};',
         shell=True,
     )
 
@@ -39,8 +39,10 @@ def conduct_self_test():
             logger.info(f'Removing {path} as part of cleanup ..')
             shutil.rmtree(path)
 
-    logger.info('All tests passed, OutRank seems in shape \N{winking face}')
+    logger.info(f'All tests passed for heuristic: {heuristic} \N{rocket}')
 
 
 if __name__ == '__main__':
-    conduct_self_test()
+    conduct_self_test('MI-numba-randomized')
+    conduct_self_test('max-value-coverage')
+    logger.info('OutRank seems in shape \N{winking face}')
diff --git a/setup.py b/setup.py
index 38756fc..f662995 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ def _read_description():
 packages = [x for x in setuptools.find_packages() if x != 'test']
 setuptools.setup(
     name='outrank',
-    version='0.96.0',
+    version='0.96.1',
     description='OutRank: Feature ranking for massive sparse data sets.',
     long_description=_read_description(),
     long_description_content_type='text/markdown',
diff --git a/tests/cov_heu_test.py b/tests/cov_heu_test.py
new file mode 100644
index 0000000..8f6a465
--- /dev/null
+++ b/tests/cov_heu_test.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+import sys
+import unittest
+
+import numpy as np
+
+from outrank.algorithms.feature_ranking.ranking_cov_alignment import \
+    max_pair_coverage
+
+np.random.seed(123)
+sys.path.append('./outrank')
+
+
+class TestMaxPairCoverage(unittest.TestCase):
+    def test_basic_functionality(self):
+        array1 = np.array([1, 2, 3, 1, 2])
+        array2 = np.array([4, 5, 6, 4, 5])
+        result = max_pair_coverage(array1, array2)
+        self.assertAlmostEqual(result, 2/5, places=5)
+
+    def test_identical_elements(self):
+        array1 = np.array([1, 1, 1, 1])
+        array2 = np.array([1, 1, 1, 1])
+        result = max_pair_coverage(array1, array2)
+        self.assertEqual(result, 1.0)
+
+    def test_large_arrays(self):
+        array1 = np.random.randint(0, 100, size=10000)
+        array2 = np.random.randint(0, 100, size=10000)
+        result = max_pair_coverage(array1, array2)
+        self.assertTrue(0 <= result <= 1)
+
+    def test_all_unique_pairs(self):
+        array1 = np.array([1, 2, 3, 4, 5])
+        array2 = np.array([6, 7, 8, 9, 10])
+        result = max_pair_coverage(array1, array2)
+        self.assertEqual(result, 1/5)
+
+    def test_all_same_pairs(self):
+        array1 = np.array([1, 1, 1, 1, 1])
+        array2 = np.array([2, 2, 2, 2, 2])
+        result = max_pair_coverage(array1, array2)
+        self.assertEqual(result, 1.0)
+
+    def test_high_collision_potential(self):
+        array1 = np.array([1] * 1000)
+        array2 = np.array([2] * 1000)
+        result = max_pair_coverage(array1, array2)
+        self.assertEqual(result, 1.0)
+
+    def test_very_large_arrays(self):
+        array1 = np.random.randint(0, 1000, size=1000000)
+        array2 = np.random.randint(0, 1000, size=1000000)
+        result = max_pair_coverage(array1, array2)
+        self.assertTrue(0 <= result <= 1)