Skip to content

Commit

Permalink
randomized heuristic
Browse files Browse the repository at this point in the history
  • Loading branch information
SkBlaz committed Oct 24, 2024
1 parent 8664d87 commit b60b5b0
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 24 deletions.
7 changes: 3 additions & 4 deletions examples/recursive_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@
DATA_PATH = os.path.expanduser('~/datasets/toy')
MODEL_SPEC_DIR = 'model_spec_dir'
LABEL_COLUMN_NAME = 'label'
HEURISTIC = 'surrogate-SGD-prior'
HEURISTIC = 'MI-numba-randomized'
DATA_FORMAT = 'ob-vw'
NUM_THREADS = 6
INTERACTION_ORDER = 2
COMBINATION_NUMBER_BOUND = 300
MINIBATCH_SIZE = 30_000
COMBINATION_NUMBER_BOUND = 1_000
MINIBATCH_SIZE = 10_000
SUBSAMPLING = 1

def run_outrank_task(reference_model_json: str, output_folder: str) -> None:
Expand All @@ -43,7 +43,6 @@ def run_outrank_task(reference_model_json: str, output_folder: str) -> None:
def process_results(output_folder: str) -> str:
"""Read the results and extract the best feature."""
results = pd.read_csv(os.path.join(output_folder, 'feature_singles.tsv'), delimiter='\t')
logger.info(f'Results head:\n{results.head(5)}')
best_feature = '-'.join(results.Feature.iloc[1].split('-')[:-1])
best_feature = ','.join(best_feature.split(' AND '))
logger.info(f'Best feature: {best_feature}')
Expand Down
70 changes: 51 additions & 19 deletions outrank/algorithms/importance_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ def sklearn_surrogate(

def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
cardinality_correction = heuristic == 'MI-numba-randomized'

if vector_first.shape[1] == 1:
vector_first = vector_first.reshape(-1)
else:
vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1)

return ranking_mi_numba.mutual_info_estimator_numba(
vector_first.astype(np.int32),
vector_second.astype(np.int32),
Expand All @@ -74,38 +80,64 @@ def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str
def sklearn_mi_adj(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
    """Chance-adjusted mutual information between two discrete label vectors.

    Thin wrapper over sklearn's ``adjusted_mutual_info_score`` so every
    ranking heuristic shares the same ``(vector, vector) -> float`` call shape.
    """
    adjusted_score = adjusted_mutual_info_score(vector_first, vector_second)
    return adjusted_score

def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]:
def generate_data_for_ranking(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Extract the (input, target) value vectors for one feature pair.

    The label column, if present in the pair, is always moved to the second
    slot so downstream heuristics can treat ``vector_second`` as the target.

    :param combination: Pair of feature/column names to rank.
    :param reference_model_features: Features of an existing reference model.
    :param args: Parsed CLI arguments (uses ``label_column`` and
        ``reference_model_JSON``).
    :param tmp_df: Frame holding the encoded feature values.
    :return: ``(vector_first, vector_second)`` value arrays.
    """
    # Bug fix: the return annotation was ``tuple(np.ndarray, np.ndrray)`` —
    # a call expression with a typo that raises at import time; the
    # subscripted generic ``tuple[...]`` is the correct form.
    feature_one, feature_two = combination

    # Normalize orientation: the label, when it appears first, is swapped
    # into the second (target) position.
    if feature_one == args.label_column:
        feature_one = feature_two
        feature_two = args.label_column

    # With a reference model, the candidate feature is ranked jointly with
    # the model's existing features (2-D array); otherwise the candidate's
    # 1-D values are used. NOTE(review): assumes the reference features are
    # present as columns of tmp_df — confirm against caller.
    if args.reference_model_JSON != '' and args.reference_model_JSON is not None:
        vector_first = tmp_df[list(reference_model_features) + [feature_one]].values
    else:
        vector_first = tmp_df[feature_one].values

    vector_second = tmp_df[feature_two].values
    return vector_first, vector_second
def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray, args: Any) -> float:
    """Score a feature pair with the heuristic selected via ``args.heuristic``.

    Dispatches to the matching estimator; unknown heuristics log a warning
    and score 0.0 rather than raising, so a batch ranking run is not aborted
    by a single bad configuration value.

    :param vector_first: Encoded values of the candidate feature(s).
    :param vector_second: Encoded values of the target feature.
    :param args: Parsed CLI arguments (uses ``heuristic`` and
        ``mi_stratified_sampling_ratio``).
    :return: Heuristic score (0.0 for disabled/unknown heuristics).
    """
    heuristic = args.heuristic
    score = 0.0

    if heuristic == 'MI':
        score = sklearn_MI(vector_first, vector_second)

    elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-prior'}:
        # Surrogate models are disabled in this version; keep the branch so
        # old configurations degrade gracefully instead of crashing.
        logger.warning('surrogate-based models currently not available .. Try a MI-based one (e.g., MI-numba-randomized).')
        score = 0.0

    elif heuristic == 'max-value-coverage':
        score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)

    elif heuristic == 'MI-numba-randomized':
        score = numba_mi(vector_first, vector_second, heuristic, args.mi_stratified_sampling_ratio)

    elif heuristic == 'AMI':
        score = sklearn_mi_adj(vector_first, vector_second)

    elif heuristic == 'correlation-Pearson':
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        score = pearsonr(vector_first, vector_second)[0]

    elif heuristic == 'Constant':
        score = 0.0

    else:
        logger.warning(f'{heuristic} not defined!')
        score = 0.0

    return score

def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]:
    """Estimate the importance score of a single feature pair.

    Delegates data extraction to ``generate_data_for_ranking`` and scoring
    to ``conduct_feature_ranking``, then tags the score with the pair names.
    """
    first_name, second_name = combination

    encoded_inputs, encoded_output = generate_data_for_ranking(
        combination, reference_model_features, args, tmp_df,
    )
    pair_score = conduct_feature_ranking(encoded_inputs, encoded_output, args)

    return first_name, second_name, pair_score

def rank_features_3MR(
relevance_dict: dict[str, float],
Expand Down
2 changes: 1 addition & 1 deletion outrank/core_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ def summarize_rare_counts(


def is_prior_heuristic(args: Any) -> bool:
    """Return True when the chosen heuristic can exploit a prior (reference) model.

    Requires both the randomized-MI heuristic and a non-empty reference
    model JSON path.
    """
    # Bug fix: the condition tested the bare string literal
    # 'MI-numba-randomized' (always truthy) instead of matching it against
    # args.heuristic, so ANY heuristic with a reference model returned True.
    if 'MI-numba-randomized' in args.heuristic and args.reference_model_JSON:
        return True
    return False

Expand Down

0 comments on commit b60b5b0

Please sign in to comment.