diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py index 7e076c6..2ffe820 100644 --- a/examples/recursive_ranking.py +++ b/examples/recursive_ranking.py @@ -18,12 +18,12 @@ DATA_PATH = os.path.expanduser('~/datasets/toy') MODEL_SPEC_DIR = 'model_spec_dir' LABEL_COLUMN_NAME = 'label' -HEURISTIC = 'surrogate-SGD-prior' +HEURISTIC = 'MI-numba-randomized' DATA_FORMAT = 'ob-vw' NUM_THREADS = 6 INTERACTION_ORDER = 2 -COMBINATION_NUMBER_BOUND = 300 -MINIBATCH_SIZE = 30_000 +COMBINATION_NUMBER_BOUND = 1_000 +MINIBATCH_SIZE = 10_000 SUBSAMPLING = 1 def run_outrank_task(reference_model_json: str, output_folder: str) -> None: @@ -43,7 +43,6 @@ def run_outrank_task(reference_model_json: str, output_folder: str) -> None: def process_results(output_folder: str) -> str: """Read the results and extract the best feature.""" results = pd.read_csv(os.path.join(output_folder, 'feature_singles.tsv'), delimiter='\t') - logger.info(f'Results head:\n{results.head(5)}') best_feature = '-'.join(results.Feature.iloc[1].split('-')[:-1]) best_feature = ','.join(best_feature.split(' AND ')) logger.info(f'Best feature: {best_feature}') diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index f9e8241..18f2a7d 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -64,6 +64,12 @@ def sklearn_surrogate( def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float: cardinality_correction = heuristic == 'MI-numba-randomized' + + if vector_first.ndim == 1 or vector_first.shape[1] == 1: + vector_first = vector_first.reshape(-1) + else: + vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1) + return ranking_mi_numba.mutual_info_estimator_numba( vector_first.astype(np.int32), vector_second.astype(np.int32), @@ -74,38 +80,64 @@ def numba_mi(vector_first: np.ndarray, vector_second: 
np.ndarray, heuristic: str def sklearn_mi_adj(vector_first: np.ndarray, vector_second: np.ndarray) -> float: return adjusted_mutual_info_score(vector_first, vector_second) -def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]: +def generate_data_for_ranking(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]: feature_one, feature_two = combination - if feature_one not in tmp_df.columns or feature_two not in tmp_df.columns: - logger.info(f'{feature_one} or {feature_two} not found in the constructed data frame.') - return feature_one, feature_two, 0.0 + if feature_one == args.label_column: + feature_one = feature_two + feature_two = args.label_column + + if args.reference_model_JSON != '' and args.reference_model_JSON is not None: + vector_first = tmp_df[list(reference_model_features) + [feature_one]].values + else: + vector_first = tmp_df[feature_one].values - vector_first = tmp_df[feature_one].values vector_second = tmp_df[feature_two].values + return vector_first, vector_second - if vector_first.size == 0 or vector_second.size == 0: - return feature_one, feature_two, 0.0 - if args.heuristic == 'MI': +def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray, args: Any) -> float: + + heuristic = args.heuristic + score = 0.0 + + if heuristic == 'MI': score = sklearn_MI(vector_first, vector_second) - elif 'surrogate-' in args.heuristic: - X = tmp_df[reference_model_features].values if is_prior_heuristic(args) and reference_model_features else np.array([]) - score = sklearn_surrogate(vector_first, vector_second, X, args.heuristic, is_target=True if feature_two == 'label' else False) - elif 'max-value-coverage' in args.heuristic: + + elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-prior'}: + logger.warning('surrogate-based models currently 
not available .. Try a MI-based one (e.g., MI-numba-randomized).') + score = 0.0 + + elif heuristic == 'max-value-coverage': score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second) - elif 'MI-numba' in args.heuristic: - score = numba_mi(vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio) - elif args.heuristic == 'AMI': + + elif heuristic == 'MI-numba-randomized': + score = numba_mi(vector_first, vector_second, heuristic, args.mi_stratified_sampling_ratio) + + elif heuristic == 'AMI': score = sklearn_mi_adj(vector_first, vector_second) - elif args.heuristic == 'correlation-Pearson': + + elif heuristic == 'correlation-Pearson': score = pearsonr(vector_first, vector_second)[0] - elif args.heuristic == 'Constant': + + elif heuristic == 'Constant': score = 0.0 + else: - raise ValueError('Please select a valid heuristic (MI, chi2, etc.).') + logger.warning(f'{heuristic} not defined!') + score = 0.0 + + return score + +def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]: + + feature_one, feature_two = combination + inputs_encoded, output_encoded = generate_data_for_ranking(combination, reference_model_features, args, tmp_df) + + ranking_score = conduct_feature_ranking(inputs_encoded, output_encoded, args) + + return feature_one, feature_two, ranking_score - return feature_one, feature_two, score def rank_features_3MR( relevance_dict: dict[str, float], diff --git a/outrank/core_utils.py b/outrank/core_utils.py index 1be4993..b50cc7c 100644 --- a/outrank/core_utils.py +++ b/outrank/core_utils.py @@ -647,7 +647,7 @@ def summarize_rare_counts( def is_prior_heuristic(args: Any) -> bool: - if '-prior' in args.heuristic and args.reference_model_JSON: + if 'MI-numba-randomized' in args.heuristic and args.reference_model_JSON: return True return False