diff --git a/examples/run_ranking_prior.sh b/examples/run_ranking_prior.sh
new file mode 100644
index 0000000..4421d0a
--- /dev/null
+++ b/examples/run_ranking_prior.sh
@@ -0,0 +1,21 @@
+##########################################################################################################
+# An OutRank invocation that ranks features against a reference model (a prior). It includes            #
+##########################################################################################################
+
+# This run compares features "one-at-a-time" and summarizes and visualizes the outputs.
+# Hint: if unsure what a parameter does, you can always run "outrank --help".
+
+outrank \
+    --task all \
+    --data_path $PATH_TO_YOUR_DATA \
+    --data_source ob-csv \
+    --heuristic surrogate-SGD-prior \
+    --target_ranking_only True \
+    --interaction_order 2 \
+    --combination_number_upper_bound 2048 \
+    --num_threads 12 \
+    --output_folder ./some_output_folder \
+    --subsampling 100 \
+    --minibatch_size 10000 \
+    --label_column info_click_valid \
+    --reference_model_JSON $PATH_TO_YOUR_REFERENCE_MODEL
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
index a228c4a..e37ba1c 100644
--- a/outrank/algorithms/importance_estimator.py
+++ b/outrank/algorithms/importance_estimator.py
@@ -11,15 +11,20 @@ import pandas as pd
 from scipy.stats import pearsonr
 from sklearn.feature_selection import mutual_info_classif
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.metrics import adjusted_mutual_info_score
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.svm import SVC

+from outrank.core_utils import is_prior_heuristic
+
+
 logger = logging.getLogger('syn-logger')
 logger.setLevel(logging.DEBUG)

+num_folds = 4
+
 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba

@@ -38,13 +43,11 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float:


 def sklearn_surrogate(
-    vector_first: Any, vector_second: Any, surrogate_model: str,
+    vector_first: Any, vector_second: Any, X: Any, surrogate_model: str,
 ) -> float:
-    if surrogate_model == 'surrogate-LR':
-        clf = LogisticRegression(max_iter=100000)
-    elif surrogate_model == 'surrogate-SVM':
-        clf = SVC(gamma='auto', probability=True)
-
+
+    clf = initialize_classifier(surrogate_model)
+
     transf = OneHotEncoder()

     # They do not commute, swap if needed
@@ -54,20 +57,17 @@ def sklearn_surrogate(
         vector_first = vector_third
         del vector_third

-    unique_values, counts = np.unique(vector_second, return_counts=True)
-
-    # Establish min support for this type of ranking.
-    if counts[0] < len(unique_values) * (2**5):
-        estimate_feature_importance = 0
-
+    if X.size <= 1:
+        X = vector_first.reshape(-1, 1)
     else:
-        vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
-        estimate_feature_importance_list = cross_val_score(
-            clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
-        )
+        X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)

-        estimate_feature_importance = 1 + \
-            np.median(estimate_feature_importance_list)
+    X = transf.fit_transform(X)
+    estimate_feature_importance_list = cross_val_score(
+        clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
+    )
+    estimate_feature_importance = 1 + \
+        np.median(estimate_feature_importance_list)

     return estimate_feature_importance

@@ -97,7 +97,7 @@ def sklearn_mi_adj(vector_first, vector_second):
     return estimate_feature_importance


-def get_importances_estimate_pairwise(combination, args, tmp_df):
+def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
     """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""

     feature_one = combination[0]
@@ -122,8 +122,12 @@ def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
         estimate_feature_importance = sklearn_MI(vector_first, vector_second)

     elif 'surrogate-' in args.heuristic:
+        X = np.empty(0)  # empty placeholder; sklearn_surrogate then falls back to the feature alone
+        if is_prior_heuristic(args) and (len(reference_model_features) > 0):
+            X = tmp_df[reference_model_features].values
+
         estimate_feature_importance = sklearn_surrogate(
-            vector_first, vector_second, args.heuristic,
+            vector_first, vector_second, X, args.heuristic,
         )

     elif 'MI-numba' in args.heuristic:
@@ -213,3 +217,15 @@ def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
     # TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
     # TODO - this is to be executed directly on df - no need for parallel kernel(s)
     pass
+
+
+def initialize_classifier(surrogate_model: str):
+    if 'surrogate-LR' in surrogate_model:
+        return LogisticRegression(max_iter=100000)
+    elif 'surrogate-SVM' in surrogate_model:
+        return SVC(gamma='auto', probability=True)
+    elif 'surrogate-SGD' in surrogate_model:
+        return SGDClassifier(max_iter=100000, loss='log_loss')
+    else:
+        logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
+        return SGDClassifier(max_iter=100000, loss='log_loss')
diff --git a/outrank/algorithms/synthetic_data_generators/generator_naive.py b/outrank/algorithms/synthetic_data_generators/generator_naive.py
index 23c5f8a..7404360 100644
--- a/outrank/algorithms/synthetic_data_generators/generator_naive.py
+++ b/outrank/algorithms/synthetic_data_generators/generator_naive.py
@@ -13,7 +13,8 @@ def generate_random_matrix(num_features=100, size=20000):
     target = sample[:, 30]

     # Some noise
-    target[target < 20] = 0
+    target[target < 40] = 0
+    target[target > 39] = 1

     return sample, target
diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py
index 39843f7..30f892c 100644
--- a/outrank/core_ranking.py
+++ b/outrank/core_ranking.py
@@ -32,6 +32,7 @@ from outrank.core_utils import internal_hash
 from outrank.core_utils import NominalFeatureSummary
 from outrank.core_utils import NumericFeatureSummary
+from outrank.core_utils import is_prior_heuristic
 from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric
 from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise

@@ -50,12 +51,15 @@ def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]:
     """Make sure only relevant subspace of combinations is selected based on prior counts"""

-    if len(GLOBAL_PRIOR_COMB_COUNTS) == 0:
-        for combination in combinations:
-            GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
-        tmp = combinations[:args.combination_number_upper_bound]
-    else:
-        tmp = list(x[0] for x in sorted(GLOBAL_PRIOR_COMB_COUNTS.items(), key=lambda x:x[1], reverse=False))[:args.combination_number_upper_bound]
+    if len(combinations) == 0:
+        return []
+
+    missing_combinations = set(combinations).difference(GLOBAL_PRIOR_COMB_COUNTS.keys())
+    if len(missing_combinations) > 0:
+        for combination in missing_combinations:
+            GLOBAL_PRIOR_COMB_COUNTS[combination] = 0
+
+    tmp = sorted(combinations, key=GLOBAL_PRIOR_COMB_COUNTS.get, reverse=False)[:args.combination_number_upper_bound]

     for combination in tmp:
         GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
@@ -115,6 +119,12 @@ def mixed_rank_graph(
     out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer

     combinations = get_combinations_from_columns(all_columns, args)
+
+    reference_model_features = []
+    if is_prior_heuristic(args):
+        reference_model_features = [' AND '.join(sorted(item.split(','))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)]
+        combinations = [comb for comb in combinations if comb[0] not in reference_model_features and comb[1] not in reference_model_features]
+
     combinations = prior_combinations_sample(combinations, args)
     random.shuffle(combinations)

@@ -132,7 +142,7 @@ def mixed_rank_graph(

     # starmap is an alternative that is slower unfortunately (but nicer)
     def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
-        return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
+        return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df)

     start_enc_timer = timer()
     with cpu_pool as p:
@@ -176,7 +186,6 @@ def enrich_with_transformations(

 def compute_combined_features(
     input_dataframe: pd.DataFrame,
-    logger: Any,
     args: Any,
     pbar: Any,
     is_3mr: bool = False,
@@ -189,19 +198,25 @@ def compute_combined_features(
     join_string = ' AND_REL ' if is_3mr else ' AND '
     interaction_order = 2 if is_3mr else args.interaction_order

+    model_combinations = []
+    full_combination_space = []
+
+
+    if args.interaction_order > 1:
+        full_combination_space = list(
+            itertools.combinations(all_columns, interaction_order),
+        )
+        full_combination_space = prior_combinations_sample(full_combination_space, args)
+
     if args.reference_model_JSON != '':
-        combined_features = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
-        full_combination_space = [combination.split(',') for combination in combined_features]
-    else:
-        full_combination_space = list(
-            itertools.combinations(all_columns, interaction_order),
-        )
+        model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only=True)
+        model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations]
+        if not is_prior_heuristic(args):
+            full_combination_space = model_combinations
+
+    if is_prior_heuristic(args):
+        full_combination_space = full_combination_space + [comb for comb in model_combinations if comb not in full_combination_space]

-    if args.combination_number_upper_bound and args.reference_model_JSON != '':
-        random.shuffle(full_combination_space)
-        full_combination_space = full_combination_space[
-            : args.combination_number_upper_bound
-        ]
     com_counter = 0
     new_feature_hash = {}
@@ -531,7 +546,7 @@ def compute_batch_ranking(
     if args.interaction_order > 1 or args.reference_model_JSON:
         pbar.set_description('Constructing new features')
         input_dataframe = compute_combined_features(
-            input_dataframe, logger, args, pbar,
+            input_dataframe, args, pbar,
         )

     # in case of 3mr we compute the score of combinations against the target
@@ -540,7 +555,7 @@ def compute_batch_ranking(
             'Constructing features for computing relations in 3mr',
         )
         input_dataframe = compute_combined_features(
-            input_dataframe, logger, args, pbar, True,
+            input_dataframe, args, pbar, True,
         )

     if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
diff --git a/outrank/core_utils.py b/outrank/core_utils.py
index 0136d42..336cc35 100644
--- a/outrank/core_utils.py
+++ b/outrank/core_utils.py
@@ -393,7 +393,7 @@ def parse_csv_raw(data_path) -> DatasetInformationStorage:
     )


-def extract_features_from_reference_JSON(json_path: str, combined_features_only = False) -> set[Any]:
+def extract_features_from_reference_JSON(json_path: str, combined_features_only=False, all_features=False) -> set[Any]:
     """Given a model's JSON, extract unique features"""

     with open(json_path) as jp:
@@ -401,6 +401,9 @@ def extract_features_from_reference_JSON(json_path: str, combined_features_only

     unique_features = set()
     feature_space = content['desc'].get('features', [])
+    if all_features:
+        return set(feature_space)
+
     fields_space = content['desc'].get('fields', [])
     joint_space = feature_space + fields_space

@@ -641,3 +644,10 @@ def summarize_rare_counts(
     final_df.to_csv(
         f'{args.output_folder}/feature_sparsity_summary.tsv', index=False, sep='\t',
     )
+
+
+def is_prior_heuristic(args: Any) -> bool:
+    if '-prior' in args.heuristic and args.reference_model_JSON:
"-prior" in args.heuristic and args.reference_model_JSON: + return True + return False + diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py index 9b5ff6f..cd1cb45 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -1,6 +1,5 @@ # helper set of methods that enable anywhere verification of core functions from __future__ import annotations - import logging import os import shutil @@ -22,16 +21,16 @@ def conduct_self_test(): 'outrank --task data_generator --num_synthetic_rows 100000', shell=True, ) subprocess.run( - 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;', + 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;', shell=True, ) dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t') logger.info("Verifying output's properties ..") - assert dfx.shape[0] == 120 + assert dfx.shape[0] == 201 assert dfx.shape[1] == 3 - assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(81; 100)' + assert dfx['FeatureA'].values.tolist().pop() == 'label-(2; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(2; 100)' to_remove = ['ranking_outputs', 'test_data_synthetic'] for path in to_remove: @@ -40,3 +39,7 @@ def conduct_self_test(): shutil.rmtree(path) logger.info('All tests passed, OutRank seems in shape \N{winking face}') + + +if __name__ == '__main__': + conduct_self_test() diff --git a/outrank/task_summary.py b/outrank/task_summary.py index 38475d9..458c9b3 100644 --- a/outrank/task_summary.py +++ b/outrank/task_summary.py @@ -37,9 +37,10 @@ def outrank_task_result_summary(args): min_score = np.min(final_df[f'Score {args.heuristic}'].values) max_score = np.max(final_df[f'Score {args.heuristic}'].values) - final_df[f'Score {args.heuristic}'] = ( - final_df[f'Score {args.heuristic}'] - min_score - ) / (max_score - min_score) + if "MI" in args.heuristic: + final_df[f'Score {args.heuristic}'] = ( + final_df[f'Score {args.heuristic}'] - min_score + ) / (max_score - min_score) logging.info(f'Storing summary files to {args.output_folder}') pd.set_option('display.max_rows', None, 'display.max_columns', None) singles_path = os.path.join(args.output_folder, 'feature_singles.tsv') diff --git a/tests/ranking_module_test.py b/tests/ranking_module_test.py index e49880c..fd99092 100644 --- a/tests/ranking_module_test.py +++ b/tests/ranking_module_test.py @@ -82,7 +82,7 @@ def test_compute_combinations(self): random_df.columns = ['F1', 'F2', 'F3'] local_pbar = tqdm.tqdm(total=100, position=0) transformed_df = compute_combined_features( - random_df, None, args, local_pbar, + random_df, args, local_pbar, ) self.assertEqual(transformed_df.shape[1], 4) @@ -91,7 +91,7 @@ def test_compute_combinations(self): random_df = pd.DataFrame(random_matrix) random_df.columns = ['F1', 'F2', 'F3'] transformed_df = compute_combined_features( - random_df, None, args, local_pbar, + random_df, args, local_pbar, ) self.assertEqual(transformed_df.shape[1], 6) diff --git a/tests/test_ref_model.json b/tests/test_ref_model.json new file mode 100644 index 0000000..6c36715 --- /dev/null +++ b/tests/test_ref_model.json @@ -0,0 +1,5 @@ +{ + "desc": { + "features": ["f0","f1","f0,f1"] + } +} \ No newline at end of file