From 78c205bd0b21993c0869a246a65c8cd7e3f06448 Mon Sep 17 00:00:00 2001 From: Miha Jenko <146070429+miha-jenko@users.noreply.github.com> Date: Tue, 24 Oct 2023 15:24:23 +0200 Subject: [PATCH] Optimize memory hog at combination creation + fix autopep8 issue on >= py3.10 (#54) * Optimize combinations creation * fix autopep8@py10 * Fix failing test due to unstable sorting algo * Bump semantic version --- .pre-commit-config.yaml | 1 + outrank/core_ranking.py | 60 +++++++++++++++++++----------------- pyproject.toml | 4 +++ setup.py | 2 +- tests/ranking_module_test.py | 35 ++++++++++++++++++++- 5 files changed, 72 insertions(+), 30 deletions(-) create mode 100644 pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3b98f96..81b0ac6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,7 @@ repos: rev: v2.0.4 hooks: - id: autopep8 + args: ["--global-config pyproject.toml"] - repo: https://github.com/PyCQA/flake8 rev: 6.1.0 hooks: diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 53954d8..c59293d 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -43,6 +43,7 @@ IGNORED_VALUES = set() HYPERLL_ERROR_BOUND = 0.02 + def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]: """Make sure only relevant subspace of combinations is selected based on prior counts""" @@ -59,6 +60,36 @@ def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> return tmp +def get_combinations_from_columns(all_columns: pd.Index, args: Any) -> list[tuple[Any, ...]]: + """Return feature-feature & feature-label combinations, depending on the heuristic and ranking scope""" + + if '3mr' in args.heuristic: + rel_columns = [column for column in all_columns if ' AND_REL ' in column] + non_rel_columns = sorted(set(all_columns) - set(rel_columns)) + + combinations = list( + itertools.combinations_with_replacement(non_rel_columns, 2), + ) + combinations += [(column, args.label_column) for column in rel_columns] + else: + _combinations = itertools.combinations_with_replacement(all_columns, 2) + + # Some applications do not require the full feature-feature triangular matrix + if args.target_ranking_only == 'True': + combinations = [x for x in _combinations if args.label_column in x] + else: + combinations = list(_combinations) + + if args.target_ranking_only != 'True': + # Diagonal elements (non-label) + combinations += [ + (individual_column, individual_column) + for individual_column in all_columns + if individual_column != args.label_column + ] + return combinations + + def mixed_rank_graph( input_dataframe: pd.DataFrame, args: Any, cpu_pool: Any, pbar: Any, ) -> BatchRankingSummary: @@ -78,34 +109,7 @@ def mixed_rank_graph( end_enc_timer = timer() out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer - # Helper method for parallel estimation - combinations = list( - itertools.combinations_with_replacement(all_columns, 2), - ) - - if '3mr' in args.heuristic: - rel_columns = [ - column for column in all_columns if ' AND_REL ' in column - ] - non_rel_columns = list(set(all_columns) - set(rel_columns)) - combinations = list( - itertools.combinations_with_replacement(non_rel_columns, 2), - ) - combinations += [(column, args.label_column) for column in rel_columns] - else: - combinations = list( - itertools.combinations_with_replacement(all_columns, 2), - ) - - # Diagonal elements - for individual_column in all_columns: - if individual_column != args.label_column: - combinations += [(individual_column, individual_column)] - - # Some applications do not require the full feature-feature triangular matrix - if (args.target_ranking_only == 'True') and ('3mr' not in args.heuristic): - combinations = [x for x in combinations if args.label_column in x] - + combinations = get_combinations_from_columns(all_columns, args) combinations = prior_combinations_sample(combinations, args) random.shuffle(combinations) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..32c197e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.autopep8] +in-place = true +list-fixes = true +ignore = "W690" diff --git a/setup.py b/setup.py index 1cf5d74..fc6fb91 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def _read_description(): packages = [x for x in setuptools.find_packages() if x != 'test'] setuptools.setup( name='outrank', - version='0.95.1', + version='0.95.2', description='OutRank: Feature ranking for massive sparse data sets.', long_description=_read_description(), long_description_content_type='text/markdown', diff --git a/tests/ranking_module_test.py b/tests/ranking_module_test.py index 4461006..5e96fbf 100644 --- a/tests/ranking_module_test.py +++ b/tests/ranking_module_test.py @@ -10,6 +10,7 @@ from pathos.multiprocessing import ProcessingPool as Pool from outrank.core_ranking import compute_combined_features +from outrank.core_ranking import get_combinations_from_columns from outrank.core_ranking import mixed_rank_graph from outrank.feature_transformations.feature_transformer_vault import ( default_transformers, @@ -29,7 +30,7 @@ class args: label_column: str = 'label' heuristic: str = 'surrogate-LR' - target_ranking_only: bool = True + target_ranking_only: str = 'True' interaction_order: int = 3 combination_number_upper_bound: int = 1024 @@ -91,6 +92,38 @@ def test_compute_combinations(self): ) self.assertEqual(transformed_df.shape[1], 6) + def test_get_combinations_from_columns_target_ranking_only(self): + all_columns = pd.Index(['a', 'b', 'label']) + args.heuristic = 'MI-numba-randomized' + args.target_ranking_only = 'True' + combinations = get_combinations_from_columns(all_columns, args) + + self.assertSetEqual( + set(combinations), + {('a', 'label'), ('b', 'label'), ('label', 'label')}, + ) + + def test_get_combinations_from_columns(self): + all_columns = pd.Index(['a', 'b', 'label']) + args.heuristic = 'MI-numba-randomized' + args.target_ranking_only = 'False' + combinations = get_combinations_from_columns(all_columns, args) + + self.assertSetEqual( + set(combinations), + {('a', 'a'), ('b', 'b'), ('label', 'label'), ('a', 'b'), ('a', 'label'), ('b', 'label')}, + ) + + def test_get_combinations_from_columns_3mr(self): + all_columns = pd.Index(['a', 'b', 'label']) + args.heuristic = 'MI-numba-3mr' + combinations = get_combinations_from_columns(all_columns, args) + + self.assertSetEqual( + set(combinations), + {('a', 'a'), ('b', 'b'), ('label', 'label'), ('a', 'b'), ('a', 'label'), ('b', 'label')}, + ) + if __name__ == '__main__': unittest.main()