Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF-1523-Recursive feature ranking/evolution #86

Merged
merged 16 commits into from
Oct 25, 2024
21 changes: 21 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Feature Evolution via Ranking

This script drives feature evolution through iterative ranking with the `outrank` tool: it runs several ranking iterations, extracts the best feature from each one, and updates the model specification accordingly.

## Overview

The script performs the following steps:
1. **Initialization**: Sets up the initial model specification directory and creates the initial model JSON file.
2. **Iteration**: Runs the `outrank` task for a specified number of iterations.
3. **Feature Extraction**: Processes the results of each iteration to extract the best feature.
4. **Model Update**: Updates the model specification JSON with the newly identified best feature.

## Prerequisites

- Ensure that the `outrank` tool is installed and accessible from the command line.
- Python 3.7 or higher (the script starts with `from __future__ import annotations`, which is not available on 3.6).
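- Required third-party Python package: `pandas`. The other modules the script imports (`argparse`, `json`, `logging`, `os`, `shutil`, `subprocess`) are part of the Python standard library and need no installation.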

## Installation

Install the required Python packages using pip (`pip install outrank --upgrade`)
108 changes: 108 additions & 0 deletions examples/recursive_ranking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from __future__ import annotations

import argparse
import json
import logging
import os
import shutil
import subprocess

import pandas as pd

# Configure logging: the root logger emits DEBUG and above, and every record is
# prefixed with its timestamp and level name.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
# Named logger through which the functions in this script report progress.
logger = logging.getLogger('syn-logger')

# Configuration constants
# Dataset location handed to `outrank --data_path`; a leading `~` is expanded to
# the current user's home directory.
DATA_PATH = os.path.expanduser('~/datasets/toy')
miha-jenko marked this conversation as resolved.
Show resolved Hide resolved
# Directory holding the per-iteration model specifications (model_<i>.json).
MODEL_SPEC_DIR = 'model_spec_dir'
# Value passed to `outrank --label_column`.
LABEL_COLUMN_NAME = 'label'
# Value passed to `outrank --heuristic`.
HEURISTIC = 'surrogate-SGD-SVD'
# Value passed to `outrank --data_source`.
DATA_FORMAT = 'ob-vw'
# Value passed to `outrank --num_threads`.
NUM_THREADS = 6
# Value passed to `outrank --interaction_order`.
INTERACTION_ORDER = 2
# Value passed to `outrank --combination_number_upper_bound`.
COMBINATION_NUMBER_BOUND = 1_000
# Value passed to `outrank --minibatch_size`.
MINIBATCH_SIZE = 10_000
# Value passed to `outrank --subsampling`.
SUBSAMPLING = 10

def run_outrank_task(reference_model_json: str, output_folder: str) -> None:
    """Run one ``outrank`` ranking pass against a reference model.

    Args:
        reference_model_json: Path of the model specification JSON, passed to
            ``outrank`` as ``--reference_model_JSON``.
        output_folder: Directory passed to ``outrank`` as ``--output_folder``.

    Raises:
        subprocess.CalledProcessError: If ``outrank`` exits with a non-zero status.
        FileNotFoundError: If the ``outrank`` executable cannot be found.
    """
    import shlex  # local import: only needed to render the command for the log

    # An argument vector (instead of a single shell string) hands every value
    # to outrank verbatim, so paths containing spaces or shell metacharacters
    # are neither split nor interpreted by a shell.
    outrank_command = [
        'outrank',
        '--task', 'all',
        '--data_path', str(DATA_PATH),
        '--data_source', str(DATA_FORMAT),
        '--target_ranking_only', 'True',
        '--combination_number_upper_bound', str(COMBINATION_NUMBER_BOUND),
        '--num_threads', str(NUM_THREADS),
        '--interaction_order', str(INTERACTION_ORDER),
        '--output_folder', str(output_folder),
        '--reference_model_JSON', str(reference_model_json),
        '--heuristic', str(HEURISTIC),
        '--label_column', str(LABEL_COLUMN_NAME),
        '--subsampling', str(SUBSAMPLING),
        '--minibatch_size', str(MINIBATCH_SIZE),
        '--disable_tqdm', 'False',
    ]
    # Log a copy-pasteable, shell-quoted rendering of the exact invocation.
    printable_command = ' '.join(shlex.quote(part) for part in outrank_command)
    logger.info(f'Running outrank command: {printable_command}')
    subprocess.run(outrank_command, check=True)
    logger.info(f'Outrank task completed for {reference_model_json}')

def process_results(output_folder: str) -> str:
    """Return the best feature recorded in ``<output_folder>/feature_singles.tsv``.

    The value is taken from the ``Feature`` column, stripped of everything from
    its last ``-`` onwards, and every `` AND `` separator is turned into a comma.
    """
    ranking_path = os.path.join(output_folder, 'feature_singles.tsv')
    ranking = pd.read_csv(ranking_path, sep='\t')

    # NOTE(review): the entry at position 1 (the second row) is used, not the
    # first one - confirm this matches the layout of outrank's output.
    raw_entry = ranking['Feature'].iloc[1]

    # Keep only what precedes the final '-' (an empty string when there is none).
    feature_name, _, _ = raw_entry.rpartition('-')
    best_feature = feature_name.replace(' AND ', ',')

    logger.info(f'Best feature: {best_feature}')
    return best_feature

def update_model_spec(model_index: int, best_feature: str) -> None:
    """Write ``model_<model_index + 1>.json`` as the current model plus one feature.

    The feature list is read from ``model_<model_index>.json`` inside
    ``MODEL_SPEC_DIR``, ``best_feature`` is appended to it, and the result is
    stored as the next model specification. Only the ``desc.features`` entry is
    carried over to the new file.
    """
    def spec_path(index: int) -> str:
        return os.path.join(MODEL_SPEC_DIR, f'model_{index}.json')

    source_path = spec_path(model_index)
    target_path = spec_path(model_index + 1)

    with open(source_path) as source:
        features = json.load(source)['desc']['features']

    features.append(best_feature)
    logger.info(f'Updated features: {features}')

    with open(target_path, 'w') as target:
        json.dump({'desc': {'features': features}}, target)

def initialize_model_spec_dir() -> None:
    """Reset ``MODEL_SPEC_DIR`` so it contains only a fresh ``model_0.json``.

    The directory is created when missing, its existing (non-hidden) entries
    are removed, and an initial specification with an empty feature list is
    written.

    Unlike a ``mkdir && rm dir/* && echo`` shell chain, this also works when
    the directory is new or empty: there ``rm`` fails on the unmatched glob and
    the chain would abort before the initial model is written.
    """
    os.makedirs(MODEL_SPEC_DIR, exist_ok=True)

    # Snapshot the listing first so entries can be deleted while looping.
    for entry_name in os.listdir(MODEL_SPEC_DIR):
        # Hidden entries are left alone, mirroring what a shell `*` glob matches.
        if entry_name.startswith('.'):
            continue
        entry_path = os.path.join(MODEL_SPEC_DIR, entry_name)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)

    initial_model_path = os.path.join(MODEL_SPEC_DIR, 'model_0.json')
    with open(initial_model_path, 'w') as file:
        json.dump({'desc': {'features': []}}, file)

    logger.info('Initialized model specification directory with model_0.json')

def run_evolution(iterations: int) -> None:
    """Run up to ``iterations`` rounds of rank -> extract -> update.

    Round ``i`` ranks features against ``model_<i>.json``, reads the best
    feature from ``output_dir_<i>`` and writes ``model_<i + 1>.json``.

    Args:
        iterations: Maximum number of rounds to run; values <= 0 run nothing.

    A failing round is logged with its traceback and ends the evolution: the
    next round's reference model is produced by the current round, so no later
    round could succeed once one has failed.
    """
    for i in range(iterations):
        reference_model_json = os.path.join(MODEL_SPEC_DIR, f'model_{i}.json')
        output_folder = f'output_dir_{i}'

        # Start every round from an empty output folder.
        if os.path.isdir(output_folder):
            shutil.rmtree(output_folder)
        os.mkdir(output_folder)

        try:
            run_outrank_task(reference_model_json, output_folder)
            best_feature = process_results(output_folder)
            update_model_spec(i, best_feature)
        except Exception as e:
            logger.exception(f'An error occurred during iteration {i}: {e}')
            logger.error(
                f'Stopping after iteration {i}: the reference model for the next iteration was not produced.',
            )
            break
miha-jenko marked this conversation as resolved.
Show resolved Hide resolved

def parse_arguments() -> argparse.Namespace:
    """Parse the command-line arguments of the evolution script.

    Returns:
        Namespace with a single attribute, ``iterations`` (int, default 80).
    """
    parser = argparse.ArgumentParser(description='Run the outrank evolution process.')
    parser.add_argument(
        '--iterations',
        type=int,
        default=80,
        # %(default)s lets argparse print the real default, so the help text
        # cannot drift away from the value above.
        help='Number of iterations to run (default: %(default)s)',
    )
    return parser.parse_args()

if __name__ == '__main__':
    # Read the CLI first so an invalid invocation exits before the model
    # specification directory is touched.
    requested_iterations = parse_arguments().iterations
    initialize_model_spec_dir()
    run_evolution(requested_iterations)
121 changes: 81 additions & 40 deletions outrank/algorithms/importance_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,29 @@
import logging
import operator
import traceback
from typing import Any, Dict, List, Tuple
from typing import Any

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

from outrank.algorithms.feature_ranking import ranking_cov_alignment
from outrank.core_utils import is_prior_heuristic

logger = logging.getLogger('syn-logger')
logger.setLevel(logging.DEBUG)

num_folds = 4
NUM_FOLDS = 2
SVD_DIMS = 2
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why 2?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a default for now, was fast.


try:
from outrank.algorithms.feature_ranking import ranking_mi_numba
Expand All @@ -32,29 +36,33 @@

def sklearn_MI(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
return mutual_info_classif(
vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True
vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
)[0]

def sklearn_surrogate(
vector_first: np.ndarray, vector_second: np.ndarray, X: np.ndarray, surrogate_model: str
vector_first: np.ndarray, vector_second: np.ndarray, surrogate_model: str,
) -> float:
clf = initialize_classifier(surrogate_model)
transf = OneHotEncoder()
X = OneHotEncoder().fit_transform(vector_first)

if len(np.unique(vector_second)) > 2:
vector_first, vector_second = vector_second, vector_first
if '-SVD' in surrogate_model and X.shape[1] > 2:
# yes this is not super correct due to embedding full data first, but it's much faster + seems to offer same results anyways.
X = TruncatedSVD(n_components=min(SVD_DIMS, X.shape[1])).fit_transform(X)

if X.size <= 1:
X = vector_first.reshape(-1, 1)
else:
X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)

X = transf.fit_transform(X)
scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds)
clf = initialize_classifier(surrogate_model, n_dim=min(X.shape[1], 1024))
scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=NUM_FOLDS)
return 1 + np.median(scores)

def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
cardinality_correction = heuristic == 'MI-numba-randomized'

try:
if vector_first.shape[1] == 1:
vector_first = vector_first.reshape(-1)
else:
vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1)
except:
logger.warning('Reshaping for MI computation in place - you are considering many-one mapping')

return ranking_mi_numba.mutual_info_estimator_numba(
vector_first.astype(np.int32),
vector_second.astype(np.int32),
Expand All @@ -65,43 +73,68 @@ def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str
def sklearn_mi_adj(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
    """Return scikit-learn's ``adjusted_mutual_info_score`` of the two vectors."""
    return adjusted_mutual_info_score(vector_first, vector_second)

def get_importances_estimate_pairwise(combination: Tuple[str, str], reference_model_features: List[str], args: Any, tmp_df: pd.DataFrame) -> Tuple[str, str, float]:
def generate_data_for_ranking(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple(np.ndarray, np.ndrray):
feature_one, feature_two = combination

if feature_one not in tmp_df.columns or feature_two not in tmp_df.columns:
logger.info(f'{feature_one} or {feature_two} not found in the constructed data frame.')
return feature_one, feature_two, 0.0
if feature_one == args.label_column:
feature_one = feature_two
feature_two = args.label_column

if args.reference_model_JSON:
vector_first = tmp_df[list(reference_model_features) + [feature_one]].values
else:
vector_first = tmp_df[feature_one].values

vector_first = tmp_df[feature_one].values
vector_second = tmp_df[feature_two].values
return vector_first, vector_second


if vector_first.size == 0 or vector_second.size == 0:
return feature_one, feature_two, 0.0
def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray, args: Any) -> float:

if args.heuristic == 'MI':
heuristic = args.heuristic
score = 0.0

if heuristic == 'MI':
score = sklearn_MI(vector_first, vector_second)
elif 'surrogate-' in args.heuristic:
X = tmp_df[reference_model_features].values if is_prior_heuristic(args) and reference_model_features else np.array([])
score = sklearn_surrogate(vector_first, vector_second, X, args.heuristic)
elif 'max-value-coverage' in args.heuristic:

elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-RP', 'surrogate-SGD-SVD'}:
score = sklearn_surrogate(vector_first, vector_second, heuristic)

elif heuristic == 'max-value-coverage':
score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
elif 'MI-numba' in args.heuristic:
score = numba_mi(vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio)
elif args.heuristic == 'AMI':

elif heuristic == 'MI-numba-randomized':
score = numba_mi(vector_first, vector_second, heuristic, args.mi_stratified_sampling_ratio)

elif heuristic == 'AMI':
score = sklearn_mi_adj(vector_first, vector_second)
elif args.heuristic == 'correlation-Pearson':

elif heuristic == 'correlation-Pearson':
score = pearsonr(vector_first, vector_second)[0]
elif args.heuristic == 'Constant':

elif heuristic == 'Constant':
score = 0.0

else:
raise ValueError('Please select a valid heuristic (MI, chi2, etc.).')
logger.warning(f'{heuristic} not defined!')
score = 0.0

return score

def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]:
    """Score one feature pair and return ``(feature_one, feature_two, score)``.

    ``generate_data_for_ranking`` turns the pair into two vectors and
    ``conduct_feature_ranking`` scores them. The names returned are the ones
    received in ``combination``, in their original order.
    """
    first_name, second_name = combination

    vector_first, vector_second = generate_data_for_ranking(
        combination, reference_model_features, args, tmp_df,
    )
    score = conduct_feature_ranking(vector_first, vector_second, args)

    return first_name, second_name, score

return feature_one, feature_two, score

def rank_features_3MR(
relevance_dict: Dict[str, float],
redundancy_dict: Dict[Tuple[Any, Any], Any],
relational_dict: Dict[Tuple[Any, Any], Any],
relevance_dict: dict[str, float],
redundancy_dict: dict[tuple[Any, Any], Any],
relational_dict: dict[tuple[Any, Any], Any],
strategy: str = 'median',
alpha: float = 1.0,
beta: float = 1.0,
Expand Down Expand Up @@ -141,13 +174,21 @@ def calc_higher_order(feature: str, is_redundancy: bool = True) -> float:
def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame) -> None:
    """Placeholder: the body is a bare ``pass``, so the call does nothing and returns None."""
    pass

def initialize_classifier(surrogate_model: str):
def initialize_classifier(surrogate_model: str, n_dim: int) -> Any:
    """Build the estimator selected by the ``surrogate_model`` heuristic name.

    Selection is by substring, tested in this order:

    * ``surrogate-LR``     -> ``LogisticRegression``
    * ``surrogate-SVM``    -> ``SVC`` with probability estimates enabled
    * ``surrogate-SGD-RP`` -> sparse random projection to ``n_dim`` components
      followed by a log-loss ``SGDClassifier``
    * ``surrogate-SGD``    -> log-loss ``SGDClassifier``; any other name
      containing this substring (e.g. ``surrogate-SGD-SVD``) lands here too

    Any other name logs a warning and falls back to the plain ``SGDClassifier``.
    ``n_dim`` is only used by the random-projection variant.
    """
    if 'surrogate-LR' in surrogate_model:
        return LogisticRegression(max_iter=100000)

    if 'surrogate-SVM' in surrogate_model:
        return SVC(gamma='auto', probability=True)

    # The more specific "-RP" name must be tested before the generic SGD name.
    if 'surrogate-SGD-RP' in surrogate_model:
        projection = random_projection.SparseRandomProjection(n_components=n_dim)
        regressor = SGDClassifier(max_iter=100000, loss='log_loss')
        return Pipeline([('proj', projection), ('reg', regressor)])

    if 'surrogate-SGD' not in surrogate_model:
        logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')

    return SGDClassifier(max_iter=100000, loss='log_loss')
1 change: 0 additions & 1 deletion outrank/algorithms/sketches/counting_cms.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import sys
from collections import Counter

import numpy as np
Expand Down
1 change: 0 additions & 1 deletion outrank/algorithms/sketches/counting_counters_ordinary.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def add(self, val):

depth = 8
width = 2**22
import numpy as np
cms = PrimitiveConstrainedCounter()

items = [1, 1, 2, 3, 3, 3, 4, 5, 2] * 10000
Expand Down
6 changes: 0 additions & 6 deletions outrank/algorithms/sketches/counting_ultiloglog.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,7 @@ def __len__(self):
if __name__ == '__main__':
import random
import string
import time

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm
from pympler import asizeof

def get_random_string(length):
# choose from all lowercase letter
Expand Down
5 changes: 1 addition & 4 deletions outrank/algorithms/synthetic_data_generators/cc_generator.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
from __future__ import annotations

from typing import List
from typing import Literal
from typing import Optional
from typing import Tuple
from typing import Union

import numpy as np
from numpy.typing import ArrayLike
Expand All @@ -28,7 +25,7 @@ def __init__(self, seed: int = 42):
}

def __repr__(self):
return f"CategoricalClassification(dataset_info={self.dataset_info})"
return f'CategoricalClassification(dataset_info={self.dataset_info})'

def generate_data(
self,
Expand Down
Loading
Loading