REF-1523-Recursive feature ranking/evolution (#86)

* tldr mode * tldr * docs * randomized heuristic * le tests * some refactoring * Some refactoring * Some imports * Imports * ruff * le import * nonsense-- * Minor fixes * svd works too * version
outbrain-inc · Oct 25, 2024 · 74b042c · 74b042c
1 parent b5f4a2c
commit 74b042c
Show file tree

Hide file tree

Showing 16 changed files with 241 additions and 99 deletions.
diff --git a/examples/README.md b/examples/README.md
@@ -0,0 +1,21 @@
+# Feature Evolution via Ranking
+
+This script facilitates the process of feature evolution through iterative ranking using the `outrank` tool. It automates the process of running multiple iterations of feature ranking, extracting the best features, and updating the model specifications accordingly.
+
+## Overview
+
+The script performs the following steps:
+1. **Initialization**: Sets up the initial model specification directory and creates the initial model JSON file.
+2. **Iteration**: Runs the `outrank` task for a specified number of iterations.
+3. **Feature Extraction**: Processes the results of each iteration to extract the best feature.
+4. **Model Update**: Updates the model specification JSON with the newly identified best feature.
+
+## Prerequisites
+
+- Ensure that the `outrank` tool is installed and accessible from the command line.
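+- Python 3.7 or higher (the script starts with `from __future__ import annotations`).
+- Required third-party Python package: `pandas`. The other modules the script imports (`argparse`, `json`, `logging`, `os`, `shutil`, `subprocess`) are part of the standard library.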
+
+## Installation
+
+Install the required Python packages using pip (`pip install outrank --upgrade`)
diff --git a/examples/recursive_ranking.py b/examples/recursive_ranking.py
@@ -0,0 +1,108 @@
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import shutil
+import subprocess
+
+import pandas as pd
+
+# Configure logging: DEBUG level with timestamped records
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
+logger = logging.getLogger('syn-logger')
+
+# Configuration constants; the trailing comment names the outrank CLI flag each value is passed to in run_outrank_task
+DATA_PATH = os.path.expanduser('~/datasets/toy')  # --data_path
+MODEL_SPEC_DIR = 'model_spec_dir'  # directory holding the model_<i>.json specifications
+LABEL_COLUMN_NAME = 'label'  # --label_column
+HEURISTIC = 'surrogate-SGD-SVD'  # --heuristic
+DATA_FORMAT = 'ob-vw'  # --data_source
+NUM_THREADS = 6  # --num_threads
+INTERACTION_ORDER = 2  # --interaction_order
+COMBINATION_NUMBER_BOUND = 1_000  # --combination_number_upper_bound
+MINIBATCH_SIZE = 10_000  # --minibatch_size
+SUBSAMPLING = 10  # --subsampling
+
+def run_outrank_task(reference_model_json: str, output_folder: str) -> None:
+    """Run one `outrank --task all` ranking pass as a child process.
+
+    The command line is assembled from the module-level configuration
+    constants plus the two arguments.
+
+    Args:
+        reference_model_json: Path passed as --reference_model_JSON.
+        output_folder: Directory passed as --output_folder.
+
+    Raises:
+        subprocess.CalledProcessError: If outrank exits with a non-zero status.
+        FileNotFoundError: If the `outrank` executable cannot be found.
+    """
+    # An argument list (no shell) keeps paths containing spaces or shell
+    # metacharacters intact instead of letting a shell re-split them.
+    outrank_command = [
+        'outrank',
+        '--task', 'all',
+        '--data_path', DATA_PATH,
+        '--data_source', DATA_FORMAT,
+        '--target_ranking_only', 'True',
+        '--combination_number_upper_bound', str(COMBINATION_NUMBER_BOUND),
+        '--num_threads', str(NUM_THREADS),
+        '--interaction_order', str(INTERACTION_ORDER),
+        '--output_folder', output_folder,
+        '--reference_model_JSON', reference_model_json,
+        '--heuristic', HEURISTIC,
+        '--label_column', LABEL_COLUMN_NAME,
+        '--subsampling', str(SUBSAMPLING),
+        '--minibatch_size', str(MINIBATCH_SIZE),
+        '--disable_tqdm', 'False',
+    ]
+    logger.info(f'Running outrank command: {" ".join(outrank_command)}')
+    subprocess.run(outrank_command, check=True)
+    logger.info(f'Outrank task completed for {reference_model_json}')
+
+def process_results(output_folder: str) -> str:
+    """Return the selected feature from `<output_folder>/feature_singles.tsv`.
+
+    The text after the last '-' of the chosen `Feature` entry is dropped and
+    every ' AND ' separator is replaced by ','.
+    """
+    ranking_path = os.path.join(output_folder, 'feature_singles.tsv')
+    ranking = pd.read_csv(ranking_path, delimiter='\t')
+    # NOTE(review): position 1 is the *second* data row of the table; presumably
+    # row 0 is not a usable candidate -- confirm against outrank's output format.
+    raw_name = ranking.Feature.iloc[1]
+    stem, _, _ = raw_name.rpartition('-')
+    best_feature = stem.replace(' AND ', ',')
+    logger.info(f'Best feature: {best_feature}')
+    return best_feature
+
+def update_model_spec(model_index: int, best_feature: str) -> None:
+    """Write `model_<model_index + 1>.json` as the current spec plus one feature.
+
+    Reads `model_<model_index>.json` from MODEL_SPEC_DIR, appends
+    `best_feature` to its `desc.features` list and stores only that list in
+    the next specification file.
+    """
+    def spec_path(index: int) -> str:
+        return os.path.join(MODEL_SPEC_DIR, f'model_{index}.json')
+
+    with open(spec_path(model_index)) as source:
+        features = json.load(source)['desc']['features']
+
+    features.append(best_feature)
+    logger.info(f'Updated features: {features}')
+
+    with open(spec_path(model_index + 1), 'w') as target:
+        json.dump({'desc': {'features': features}}, target)
+
+def initialize_model_spec_dir() -> None:
+    """Reset MODEL_SPEC_DIR so that it contains only an empty `model_0.json`.
+
+    Creates the directory when missing, removes whatever it currently holds,
+    and writes the starting specification `{"desc": {"features": []}}`.
+    """
+    # Done in Python rather than via `mkdir -p ... && rm -rv dir/* && echo ...`:
+    # in that shell chain `rm` exits non-zero when the glob matches nothing
+    # (fresh or already-empty directory), so model_0.json was never written
+    # and check=True raised on the very first run.
+    os.makedirs(MODEL_SPEC_DIR, exist_ok=True)
+    with os.scandir(MODEL_SPEC_DIR) as listing:
+        entries = list(listing)
+    for entry in entries:
+        if entry.is_dir(follow_symlinks=False):
+            shutil.rmtree(entry.path)
+        else:
+            os.remove(entry.path)
+
+    with open(os.path.join(MODEL_SPEC_DIR, 'model_0.json'), 'w') as file:
+        json.dump({'desc': {'features': []}}, file)
+    logger.info('Initialized model specification directory with model_0.json')
+
+def run_evolution(iterations: int) -> None:
+    """Grow the model specification by one feature per iteration.
+
+    Iteration i ranks against `model_<i>.json`, writes outrank's results to a
+    fresh `output_dir_<i>` and stores the extended spec as `model_<i+1>.json`.
+    The loop stops at the first failing iteration.
+
+    Args:
+        iterations: Maximum number of ranking rounds to run.
+    """
+    for i in range(iterations):
+        reference_model_json = os.path.join(MODEL_SPEC_DIR, f'model_{i}.json')
+        output_folder = f'output_dir_{i}'
+
+        # Always start from an empty output folder.
+        if os.path.isdir(output_folder):
+            shutil.rmtree(output_folder)
+        os.mkdir(output_folder)
+
+        try:
+            run_outrank_task(reference_model_json, output_folder)
+            best_feature = process_results(output_folder)
+            update_model_spec(i, best_feature)
+        except Exception:
+            # model_<i+1>.json is only written by a successful iteration, so
+            # every later iteration would fail as well; stop instead of
+            # launching outrank against a missing reference model.
+            logger.exception(f'An error occurred during iteration {i}; stopping evolution')
+            break
+
+def parse_arguments() -> argparse.Namespace:
+    """Parse command-line arguments.
+
+    Returns:
+        Namespace with `iterations` (int), the number of evolution rounds.
+    """
+    parser = argparse.ArgumentParser(description='Run the outrank evolution process.')
+    parser.add_argument(
+        '--iterations',
+        type=int,
+        default=80,
+        # %(default)s is expanded by argparse, so the help text can no longer
+        # disagree with the real default (it used to claim 10).
+        help='Number of iterations to run (default: %(default)s)',
+    )
+    return parser.parse_args()
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    initialize_model_spec_dir()  # start from a spec directory holding only an empty model_0.json
+    run_evolution(args.iterations)  # one outrank pass per iteration, beginning with model_0.json
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
@@ -3,25 +3,29 @@
 import logging
 import operator
 import traceback
-from typing import Any, Dict, List, Tuple
+from typing import Any
 
 import numpy as np
 import pandas as pd
 from scipy.stats import pearsonr
+from sklearn import random_projection
+from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_selection import mutual_info_classif
-from sklearn.linear_model import LogisticRegression, SGDClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import SGDClassifier
 from sklearn.metrics import adjusted_mutual_info_score
 from sklearn.model_selection import cross_val_score
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.svm import SVC
 
 from outrank.algorithms.feature_ranking import ranking_cov_alignment
-from outrank.core_utils import is_prior_heuristic
 
 logger = logging.getLogger('syn-logger')
 logger.setLevel(logging.DEBUG)
 
-num_folds = 4
+NUM_FOLDS  = 2
+SVD_DIMS = 2
 
 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba
@@ -32,29 +36,33 @@
 
 def sklearn_MI(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
+    """Return sklearn's `mutual_info_classif` estimate for `vector_first`, treated as one discrete feature column, against `vector_second`."""
     return mutual_info_classif(
-        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True
+        vector_first.reshape(-1, 1), vector_second.reshape(-1), discrete_features=True,
     )[0]
 
 def sklearn_surrogate(
-    vector_first: np.ndarray, vector_second: np.ndarray, X: np.ndarray, surrogate_model: str
+    vector_first: np.ndarray, vector_second: np.ndarray,  surrogate_model: str,
 ) -> float:
-    clf = initialize_classifier(surrogate_model)
-    transf = OneHotEncoder()
+    """Score the inputs by how well a surrogate classifier predicts the target.
+
+    The inputs are one-hot encoded (and reduced to SVD_DIMS components for
+    '-SVD' heuristics), then the classifier chosen by `surrogate_model` is
+    cross-validated against `vector_second` using negative log-loss.
+
+    Returns:
+        1 + median of the per-fold negative log-loss scores.
+    """
+    # OneHotEncoder only accepts 2-D input; a single feature column arrives
+    # here as a 1-D array when no reference model features are prepended.
+    if vector_first.ndim == 1:
+        vector_first = vector_first.reshape(-1, 1)
+
+    X = OneHotEncoder().fit_transform(vector_first)
 
-    if len(np.unique(vector_second)) > 2:
-        vector_first, vector_second = vector_second, vector_first
+    if '-SVD' in surrogate_model and X.shape[1] > SVD_DIMS:
+        # yes this is not super correct due to embedding full data first, but it's much faster + seems to offer same results anyways.
+        X = TruncatedSVD(n_components=SVD_DIMS).fit_transform(X)
 
-    if X.size <= 1:
-        X = vector_first.reshape(-1, 1)
-    else:
-        X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)
-
-    X = transf.fit_transform(X)
-    scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=num_folds)
+    clf = initialize_classifier(surrogate_model, n_dim=min(X.shape[1], 1024))
+    scores = cross_val_score(clf, X, vector_second, scoring='neg_log_loss', cv=NUM_FOLDS)
     return 1 + np.median(scores)
 
 def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str, mi_stratified_sampling_ratio: float) -> float:
     cardinality_correction = heuristic == 'MI-numba-randomized'
+
+    try:
+        if vector_first.shape[1] == 1:
+            vector_first = vector_first.reshape(-1)
+        else:
+            vector_first = np.apply_along_axis(lambda x: np.abs(np.max(x) - np.sum(x)), 1, vector_first).reshape(-1)
+    except:
+        logger.warning('Reshaping for MI computation in place - you are considering many-one mapping')
+
     return ranking_mi_numba.mutual_info_estimator_numba(
         vector_first.astype(np.int32),
         vector_second.astype(np.int32),
@@ -65,43 +73,68 @@ def numba_mi(vector_first: np.ndarray, vector_second: np.ndarray, heuristic: str
 def sklearn_mi_adj(vector_first: np.ndarray, vector_second: np.ndarray) -> float:
+    """Return sklearn's `adjusted_mutual_info_score` for the two vectors."""
     return adjusted_mutual_info_score(vector_first, vector_second)
 
-def get_importances_estimate_pairwise(combination: Tuple[str, str], reference_model_features: List[str], args: Any, tmp_df: pd.DataFrame) -> Tuple[str, str, float]:
+def generate_data_for_ranking(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
+    """Select the input and target arrays for one feature combination.
+
+    If the first name of the pair is the label column, the pair is swapped so
+    the label becomes the target. With `args.reference_model_JSON` set, the
+    input holds the reference model's feature columns followed by the
+    candidate feature (2-D); otherwise it is the candidate column alone (1-D).
+
+    Returns:
+        `(vector_first, vector_second)`: input values and target values.
+    """
     feature_one, feature_two = combination
 
-    if feature_one not in tmp_df.columns or feature_two not in tmp_df.columns:
-        logger.info(f'{feature_one} or {feature_two} not found in the constructed data frame.')
-        return feature_one, feature_two, 0.0
+    if feature_one == args.label_column:
+        feature_one = feature_two
+        feature_two = args.label_column
+
+    if args.reference_model_JSON:
+        vector_first = tmp_df[list(reference_model_features) + [feature_one]].values
+    else:
+        vector_first = tmp_df[feature_one].values
 
-    vector_first = tmp_df[feature_one].values
     vector_second = tmp_df[feature_two].values
+    return vector_first, vector_second
+
 
-    if vector_first.size == 0 or vector_second.size == 0:
-        return feature_one, feature_two, 0.0
+def conduct_feature_ranking(vector_first: np.ndarray, vector_second: np.ndarray, args: Any) -> float:
+    """Score `vector_first` against `vector_second` with the heuristic named by `args.heuristic`.
+
+    Heuristic names are matched exactly. An unrecognised name is logged with
+    a warning and scored 0.0 rather than raising.
+    """
 
-    if args.heuristic == 'MI':
+    heuristic = args.heuristic
+    score = 0.0
+
+    if heuristic == 'MI':
         score = sklearn_MI(vector_first, vector_second)
-    elif 'surrogate-' in args.heuristic:
-        X = tmp_df[reference_model_features].values if is_prior_heuristic(args) and reference_model_features else np.array([])
-        score = sklearn_surrogate(vector_first, vector_second, X, args.heuristic)
-    elif 'max-value-coverage' in args.heuristic:
+
+    # NOTE(review): 'surrogate-LR' is handled by initialize_classifier but is not
+    # in this set, so it ends up in the "not defined" branch -- confirm intended.
+    elif heuristic in {'surrogate-SGD', 'surrogate-SVM', 'surrogate-SGD-RP', 'surrogate-SGD-SVD'}:
+        score = sklearn_surrogate(vector_first, vector_second, heuristic)
+
+    elif heuristic == 'max-value-coverage':
         score = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
-    elif 'MI-numba' in args.heuristic:
-        score = numba_mi(vector_first, vector_second, args.heuristic, args.mi_stratified_sampling_ratio)
-    elif args.heuristic == 'AMI':
+
+    # NOTE(review): only the exact name 'MI-numba-randomized' reaches numba_mi now;
+    # other names containing 'MI-numba' are no longer matched -- confirm intended.
+    elif heuristic == 'MI-numba-randomized':
+        score = numba_mi(vector_first, vector_second, heuristic, args.mi_stratified_sampling_ratio)
+
+    elif heuristic == 'AMI':
         score = sklearn_mi_adj(vector_first, vector_second)
-    elif args.heuristic == 'correlation-Pearson':
+
+    elif heuristic == 'correlation-Pearson':
         score = pearsonr(vector_first, vector_second)[0]
-    elif args.heuristic == 'Constant':
+
+    elif heuristic == 'Constant':
         score = 0.0
+
     else:
-        raise ValueError('Please select a valid heuristic (MI, chi2, etc.).')
+        logger.warning(f'{heuristic} not defined!')
+        score = 0.0
+
+    return score
+
+def get_importances_estimate_pairwise(combination: tuple[str, str], reference_model_features: list[str], args: Any, tmp_df: pd.DataFrame) -> tuple[str, str, float]:
+    """Rank one feature combination and return `(feature_one, feature_two, score)`.
+
+    The two names are returned exactly as given in `combination`, in their
+    original order; the score comes from conduct_feature_ranking applied to
+    the arrays built by generate_data_for_ranking.
+    """
+
+    feature_one, feature_two = combination
+    inputs_encoded, output_encoded = generate_data_for_ranking(combination, reference_model_features, args, tmp_df)
+
+    ranking_score = conduct_feature_ranking(inputs_encoded, output_encoded, args)
+
+    return feature_one, feature_two, ranking_score
 
-    return feature_one, feature_two, score
 
 def rank_features_3MR(
-    relevance_dict: Dict[str, float],
-    redundancy_dict: Dict[Tuple[Any, Any], Any],
-    relational_dict: Dict[Tuple[Any, Any], Any],
+    relevance_dict: dict[str, float],
+    redundancy_dict: dict[tuple[Any, Any], Any],
+    relational_dict: dict[tuple[Any, Any], Any],
     strategy: str = 'median',
     alpha: float = 1.0,
     beta: float = 1.0,
@@ -141,13 +174,21 @@ def calc_higher_order(feature: str, is_redundancy: bool = True) -> float:
 def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
+    """Placeholder: the body is empty, so the call does nothing and returns None."""
     pass
 
-def initialize_classifier(surrogate_model: str):
+def initialize_classifier(surrogate_model: str, n_dim: int) -> Any:
+    """Build the scikit-learn estimator selected by the heuristic name.
+
+    Matching is by substring, tested in the order written below. Names that
+    match none of the branches fall back to a log-loss SGDClassifier after a
+    warning.
+
+    Args:
+        surrogate_model: Heuristic name, e.g. 'surrogate-SGD-SVD'.
+        n_dim: Number of components for the random projection; only used by
+            the 'surrogate-SGD-RP' branch.
+    """
+
     if 'surrogate-LR' in surrogate_model:
         return LogisticRegression(max_iter=100000)
+
     elif 'surrogate-SVM' in surrogate_model:
         return SVC(gamma='auto', probability=True)
+
+    # Must be tested before the plain 'surrogate-SGD' branch, whose substring it contains.
+    elif 'surrogate-SGD-RP' in surrogate_model:
+        clf = Pipeline([('proj', random_projection.SparseRandomProjection(n_components=n_dim)), ('reg', SGDClassifier(max_iter=100000, loss='log_loss'))])
+        return clf
+
+    # Also reached by 'surrogate-SGD-SVD', which contains this substring.
     elif 'surrogate-SGD' in surrogate_model:
         return SGDClassifier(max_iter=100000, loss='log_loss')
+
     else:
         logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
         return SGDClassifier(max_iter=100000, loss='log_loss')
diff --git a/outrank/algorithms/sketches/counting_cms.py b/outrank/algorithms/sketches/counting_cms.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import sys
 from collections import Counter
 
 import numpy as np

diff --git a/outrank/algorithms/sketches/counting_counters_ordinary.py b/outrank/algorithms/sketches/counting_counters_ordinary.py
@@ -26,7 +26,6 @@ def add(self, val):
 
     depth = 8
     width = 2**22
-    import numpy as np
     cms = PrimitiveConstrainedCounter()
 
     items = [1, 1, 2, 3, 3, 3, 4, 5, 2] * 10000

diff --git a/outrank/algorithms/sketches/counting_ultiloglog.py b/outrank/algorithms/sketches/counting_ultiloglog.py
@@ -62,13 +62,7 @@ def __len__(self):
 if __name__ == '__main__':
     import random
     import string
-    import time
 
-    import matplotlib.pyplot as plt
-    import pandas as pd
-    import seaborn as sns
-    import tqdm
-    from pympler import asizeof
 
     def get_random_string(length):
         # choose from all lowercase letter

diff --git a/outrank/algorithms/synthetic_data_generators/cc_generator.py b/outrank/algorithms/synthetic_data_generators/cc_generator.py
@@ -1,10 +1,7 @@
 from __future__ import annotations
 
-from typing import List
 from typing import Literal
 from typing import Optional
-from typing import Tuple
-from typing import Union
 
 import numpy as np
 from numpy.typing import ArrayLike
@@ -28,7 +25,7 @@ def __init__(self, seed: int = 42):
         }
 
     def __repr__(self):
-        return f"CategoricalClassification(dataset_info={self.dataset_info})"
+        return f'CategoricalClassification(dataset_info={self.dataset_info})'
 
     def generate_data(
         self,