From 05b26d73fef246768fd517445091fbcb6fe721ef Mon Sep 17 00:00:00 2001
From: Caparrini
Date: Fri, 8 Mar 2024 19:45:39 +0100
Subject: [PATCH 01/16] Fix: unused imports

---
 mloptimizer/genoptimizer/meta.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mloptimizer/genoptimizer/meta.py b/mloptimizer/genoptimizer/meta.py
index b514188..175fee3 100644
--- a/mloptimizer/genoptimizer/meta.py
+++ b/mloptimizer/genoptimizer/meta.py
@@ -1,6 +1,4 @@
 from mloptimizer.genoptimizer import BaseOptimizer
-import json
-import os
 
 
 class SklearnOptimizer(BaseOptimizer):

From 69db326ff5d95425ba9c31c5d50317bb07a6fb22 Mon Sep 17 00:00:00 2001
From: Caparrini
Date: Sat, 9 Mar 2024 01:30:34 +0100
Subject: [PATCH 02/16] Refactor: decouple evaluation from the Optimizer; add
 more metrics, MLflow metric logging, and default metrics.

---
 mloptimizer/aux/tracker.py                 |   7 +-
 mloptimizer/evaluation/__init__.py         |   1 +
 mloptimizer/evaluation/evaluator.py        |  52 +++++++
 mloptimizer/evaluation/model_evaluation.py | 128 ++++++++++--------
 mloptimizer/genoptimizer/base.py           |  35 ++---
 .../test/test_genoptimizer/test_meta.py    |  26 ++--
 mloptimizer/test/test_model_evaluation.py  |  42 +++---
 7 files changed, 177 insertions(+), 114 deletions(-)
 create mode 100644 mloptimizer/evaluation/evaluator.py

diff --git a/mloptimizer/aux/tracker.py b/mloptimizer/aux/tracker.py
index 0a6607b..574a3b5 100644
--- a/mloptimizer/aux/tracker.py
+++ b/mloptimizer/aux/tracker.py
@@ -100,11 +100,12 @@ def log_clfs(self, classifiers_list: list, generation: int, fitness_list: list[i
             self.optimization_logger.info("Hyperparams: {}".format(str(classifiers_list[i].get_params())))
         self.gen = generation + 1
 
-    def log_evaluation(self, classifier, metric):
-        self.optimization_logger.info(f"Adding to mlflow...\nClassifier: {classifier}\nFitness: {metric}")
+    def log_evaluation(self, classifier, metrics):
+        self.optimization_logger.info(f"Adding to mlflow...\nClassifier: {classifier}\nMetrics: {metrics}")
         if self.use_mlflow:
             with self.mlflow.start_run():
                 self.mlflow.log_params(classifier.get_params())
                 # We use the generation as the step
-                self.mlflow.log_metric(key="fitness", value=metric, step=self.gen)
+                # self.mlflow.log_metric(key="fitness", value=metric, step=self.gen)
+                self.mlflow.log_metrics(metrics, step=self.gen)
diff --git a/mloptimizer/evaluation/__init__.py b/mloptimizer/evaluation/__init__.py
index eda567c..8b6a53c 100644
--- a/mloptimizer/evaluation/__init__.py
+++ b/mloptimizer/evaluation/__init__.py
@@ -1,2 +1,3 @@
 from .model_evaluation import kfold_stratified_score, temporal_kfold_score, \
     train_score, train_test_score, kfold_score
+from .evaluator import Evaluator
diff --git a/mloptimizer/evaluation/evaluator.py b/mloptimizer/evaluation/evaluator.py
new file mode 100644
index 0000000..32a2e11
--- /dev/null
+++ b/mloptimizer/evaluation/evaluator.py
@@ -0,0 +1,52 @@
+from sklearn.metrics import accuracy_score, balanced_accuracy_score
+
+
+def _default_metrics():
+    return {
+        "accuracy": accuracy_score,
+        "balanced_accuracy": balanced_accuracy_score,
+    }
+
+
+class Evaluator:
+    """
+    Evaluator class to evaluate the performance of a classifier
+
+    Parameters
+    ----------
+    eval_function : function
+        The evaluation function to use to evaluate the performance of the classifier
+    fitness_score : str
+        The fitness score to use to evaluate the performance of the classifier
+    metrics : dict
+        The metrics to use to evaluate the performance of the classifier
+        Dictionary of the form {"metric_name": metric_function}
+    """
+    def __init__(self, eval_function,
fitness_score="balanced_accuracy", metrics=None): + if metrics is None: + self.metrics = _default_metrics() + else: + self.metrics = metrics + self.eval_function = eval_function + self.fitness_score = fitness_score + + def evaluate(self, clf, features, labels): + """ + Evaluate the performance of a classifier + + Parameters + ---------- + clf : object + The classifier to evaluate + features : array-like + The features to use to evaluate the classifier + labels : array-like + The labels to use to evaluate the classifier + + Returns + ------- + metrics : dict + Dictionary of the form {"metric_name": metric_value} + """ + metrics = self.eval_function(features, labels, clf, self.metrics) + return metrics diff --git a/mloptimizer/evaluation/model_evaluation.py b/mloptimizer/evaluation/model_evaluation.py index e6f0dc8..a34c035 100644 --- a/mloptimizer/evaluation/model_evaluation.py +++ b/mloptimizer/evaluation/model_evaluation.py @@ -2,12 +2,15 @@ import time import numpy as np -from sklearn.metrics import balanced_accuracy_score, accuracy_score from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit, \ train_test_split, KFold -def train_score(features, labels, clf, score_function=accuracy_score): +def score_metrics(labels, predictions, metrics): + return dict([(k, metrics[k](labels, predictions)) for k in metrics.keys()]) + + +def train_score(features, labels, clf, metrics): """ Trains the classifier with the features and labels. @@ -19,23 +22,24 @@ def train_score(features, labels, clf, score_function=accuracy_score): List of labels clf : object classifier with methods fit, predict and score - score_function : func - function that receives y, y_pred and return a score + metrics : dict + dictionary with metrics to be used + keys are the name of the metric and values are the metric function Returns ------- - accuracy : float - score of the classifier + metrics_output : dict + dictionary with the metrics over the train set """ - logging.info("Score metric over training data\nClassifier:{}\nscore_metric:{}".format(clf, score_function)) + # logging.info("Score metric over training data\nClassifier:{}\nscore_metric:{}".format(clf, score_function)) clf.fit(features, labels) predictions = clf.predict(features) - accuracy = score_function(labels, predictions) - logging.info("Accuracy: {:.3f}".format(round(accuracy, 3))) - return accuracy + metrics_output = score_metrics(labels, predictions, metrics) + # logging.info("Accuracy: {:.3f}".format(round(accuracy, 3))) + return metrics_output -def train_test_score(features, labels, clf, score_function=accuracy_score, test_size=0.2, random_state=None): +def train_test_score(features, labels, clf, metrics, test_size=0.2, random_state=None): """ Trains the classifier with the train set features and labels, then uses the test features and labels to create score. 
@@ -48,8 +52,9 @@ def train_test_score(features, labels, clf, score_function=accuracy_score, test_ List of labels clf : object Classifier with methods fit, predict, and score - score_function : func, optional - Function that receives y_true and y_pred and returns a score + metrics : dict + dictionary with metrics to be used + keys are the name of the metric and values are the metric function test_size : float, optional Proportion of the dataset to include in the test split random_state : int, optional @@ -57,8 +62,8 @@ def train_test_score(features, labels, clf, score_function=accuracy_score, test_ Returns ------- - accuracy : float - Score of the classifier on the test set + metrics_output : dict + dictionary with the metrics over the test set """ # Splitting the dataset into training and testing sets features_train, features_test, labels_train, labels_test = train_test_split( @@ -72,15 +77,15 @@ def train_test_score(features, labels, clf, score_function=accuracy_score, test_ predictions = clf.predict(features_test) # Calculating the accuracy - accuracy = score_function(labels_test, predictions) + metrics_output = score_metrics(labels_test, predictions, metrics) - logging.info("Score metric over test data\nClassifier:{}\nscore_metric:{}".format(clf, score_function)) - logging.info("Accuracy: {:.3f}".format(round(accuracy, 3))) + # logging.info("Score metric over test data\nClassifier:{}\nscore_metric:{}".format(clf, score_function)) + # logging.info("Accuracy: {:.3f}".format(round(accuracy, 3))) - return accuracy + return metrics_output -def kfold_score(features, labels, clf, score_function=accuracy_score, n_splits=5, random_state=None): +def kfold_score(features, labels, clf, metrics, n_splits=5, random_state=None): """ Evaluates the classifier using K-Fold cross-validation. @@ -92,8 +97,9 @@ def kfold_score(features, labels, clf, score_function=accuracy_score, n_splits=5 Array of labels clf : object Classifier with methods fit and predict - score_function : func, optional - Function that receives y_true and y_pred and returns a score + metrics : dict + dictionary with metrics to be used + keys are the name of the metric and values are the metric function n_splits : int, optional Number of folds. 
Must be at least 2 random_state : int, optional @@ -101,11 +107,11 @@ def kfold_score(features, labels, clf, score_function=accuracy_score, n_splits=5 Returns ------- - average_score : float - Average score of the classifier across all folds + average_metrics : dict + mean score among k-folds test splits """ - logging.info("K-Fold accuracy\nClassifier:{}\nn_splits:{}\nscore_metric:{}".format( - clf, n_splits, score_function)) + # logging.info("K-Fold accuracy\nClassifier:{}\nn_splits:{}\nscore_metric:{}".format( + # clf, n_splits, score_function)) kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state) scores = [] @@ -115,18 +121,19 @@ def kfold_score(features, labels, clf, score_function=accuracy_score, n_splits=5 clf.fit(features_train, labels_train) predictions = clf.predict(features_test) - score = score_function(labels_test, predictions) - scores.append(score) + scores.append(score_metrics(labels_test, predictions, metrics)) - logging.info("Fold score: {:.3f}".format(score)) + # logging.info("Fold score: {:.3f}".format(score)) - average_score = np.mean(scores) - logging.info("Average K-Fold Score: {:.3f}".format(average_score)) + average_values = list(np.average(np.stack([list(d.values()) for d in scores]), axis=0)) + average_metrics = dict(zip(list(scores[0].keys()), average_values)) + # average_score = np.mean(scores) + # logging.info("Average K-Fold Score: {:.3f}".format(average_score)) - return average_score + return average_metrics -def kfold_stratified_score(features, labels, clf, n_splits=4, score_function=balanced_accuracy_score, +def kfold_stratified_score(features, labels, clf, metrics, n_splits=4, random_state=None): """ Computes KFold cross validation score using n_splits folds. @@ -144,18 +151,19 @@ def kfold_stratified_score(features, labels, clf, n_splits=4, score_function=bal classifier with methods fit, predict and score n_splits : int number of splits - score_function : func - function that receives X, y and return a score + metrics : dict + dictionary with metrics to be used + keys are the name of the metric and values are the metric function random_state : int random state for the stratified kfold Returns ------- - mean_accuracy : float + average_metrics : dict mean score among k-folds test splits """ - logging.info("KFold Stratified accuracy\nClassifier:{}\nn_splits:{}\n" - "score_metric:{}".format(clf, n_splits, score_function)) + #logging.info("KFold Stratified accuracy\nClassifier:{}\nn_splits:{}\n" + # "score_metric:{}".format(clf, n_splits, score_function)) clfs = [] @@ -197,7 +205,7 @@ def kfold_stratified_score(features, labels, clf, n_splits=4, score_function=bal labels_pred_test = clf.predict(features_test).reshape(-1) labels_predicted[test_index] = labels_pred_test - accuracies_kfold.append(score_function(labels_test, labels_pred_test)) + accuracies_kfold.append(score_metrics(labels_test, labels_pred_test, metrics)) labels_kfold.extend(labels_test) labels_kfold_predicted.extend(labels_pred_test) @@ -205,15 +213,16 @@ def kfold_stratified_score(features, labels, clf, n_splits=4, score_function=bal kcounter += 1 clfs.append(clf) - mean_accuracy = np.mean(accuracies_kfold) - std = np.std(accuracies_kfold) - logging.info("Accuracy: {:.3f} +- {:.3f}".format(round(mean_accuracy, 3), round(std, 3))) - + # mean_accuracy = np.mean(accuracies_kfold) + # std = np.std(accuracies_kfold) + # logging.info("Accuracy: {:.3f} +- {:.3f}".format(round(mean_accuracy, 3), round(std, 3))) + average_values = list(np.average(np.stack([list(d.values()) for d in 
accuracies_kfold]), axis=0)) + average_metrics = dict(zip(list(accuracies_kfold[0].keys()), average_values)) # return mean_accuracy, std, labels, labels_predicted, clfs - return mean_accuracy + return average_metrics -def temporal_kfold_score(features, labels, clf, n_splits=4, score_function=balanced_accuracy_score): +def temporal_kfold_score(features, labels, clf, metrics, n_splits=4): """ Computes KFold cross validation score using n_splits folds. It uses the features and labels to train the k-folds. @@ -230,18 +239,19 @@ def temporal_kfold_score(features, labels, clf, n_splits=4, score_function=balan classifier with methods fit, predict and score n_splits : int number of splits - score_function : func - function that receives X, y and return a score + metrics : dict + dictionary with metrics to be used + keys are the name of the metric and values are the metric function Returns ------- - mean_accuracy : float + average_metrics : dict mean score among k-folds test splits """ - logging.info("TemporalKFold accuracy\nClassifier:{}\nn_splits:{}\n" - "score_metric:{}".format(clf, n_splits, score_function)) - print("TemporalKFold accuracy\nClassifier:{}\nn_splits:{}\n" - "score_metric:{}".format(clf, n_splits, score_function)) + # logging.info("TemporalKFold accuracy\nClassifier:{}\nn_splits:{}\n" + # "score_metric:{}".format(clf, n_splits, score_function)) + # print("TemporalKFold accuracy\nClassifier:{}\nn_splits:{}\n" + # "score_metric:{}".format(clf, n_splits, score_function)) clfs = [] @@ -287,7 +297,7 @@ def temporal_kfold_score(features, labels, clf, n_splits=4, score_function=balan labels_pred_test = clf.predict(features_test) labels_predicted[test_index] = labels_pred_test - accuracies_kfold.append(score_function(labels_test, labels_pred_test)) + accuracies_kfold.append(score_metrics(labels_test, labels_pred_test, metrics)) labels_kfold.extend(labels_test) labels_kfold_predicted.extend(labels_pred_test) @@ -295,10 +305,12 @@ def temporal_kfold_score(features, labels, clf, n_splits=4, score_function=balan kcounter += 1 clfs.append(clf) - mean_accuracy = np.mean(accuracies_kfold) - std = np.std(accuracies_kfold) - logging.info("Accuracy: {:.2f} +- {:.2f}".format(round(mean_accuracy, 3), round(std, 3))) - print("Accuracy: {:.2f} +- {:.2f}".format(round(mean_accuracy, 3), round(std, 3))) + # mean_accuracy = np.mean(accuracies_kfold) + # std = np.std(accuracies_kfold) + average_values = list(np.average(np.stack([list(d.values()) for d in accuracies_kfold]), axis=0)) + average_metrics = dict(zip(list(accuracies_kfold[0].keys()), average_values)) + # logging.info("Accuracy: {:.2f} +- {:.2f}".format(round(mean_accuracy, 3), round(std, 3))) + # print("Accuracy: {:.2f} +- {:.2f}".format(round(mean_accuracy, 3), round(std, 3))) # return mean_accuracy, std, labels, labels_predicted, clfs - return mean_accuracy + return average_metrics diff --git a/mloptimizer/genoptimizer/base.py b/mloptimizer/genoptimizer/base.py index f6207fe..06921b4 100644 --- a/mloptimizer/genoptimizer/base.py +++ b/mloptimizer/genoptimizer/base.py @@ -15,6 +15,7 @@ from mloptimizer.plots import plotly_logbook, plotly_search_space from mloptimizer.hyperparams import HyperparameterSpace from mloptimizer.aux import Tracker, utils +from mloptimizer.evaluation import Evaluator class BaseOptimizer(object): @@ -29,10 +30,8 @@ class BaseOptimizer(object): np.array with the labels hyperparam_space : HyperparameterSpace object with the hyperparameter space: fixed and evolvable hyperparams - eval_function : func - function to evaluate 
the model from X, y, clf - score_function : func - function to score from y, y_pred + evaluator : Evaluator + object to evaluate the classifier eval_dict : dict dictionary with the evaluation of the individuals populations : list @@ -51,7 +50,7 @@ class BaseOptimizer(object): def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_file="mloptimizer.log", hyperparam_space: HyperparameterSpace = None, eval_function=train_score, - score_function=accuracy_score, seed=random.randint(0, 1000000), + fitness_score="accuracy", seed=random.randint(0, 1000000), use_parallel=False, use_mlflow=False): """ Creates object BaseOptimizer. @@ -68,10 +67,10 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f log file name hyperparam_space : HyperparameterSpace, optional (default=None) object with the hyperparameter space: fixed and evolvable hyperparams - eval_function : func, optional (default=kfold_stratified_score) + eval_function : func, optional (default=train_score) function to evaluate the model from X, y, clf - score_function : func, optional (default=balanced_accuracy_score) - function to score from y, y_pred + fitness_score : str, optional (default="accuracy") + fitness score to use to evaluate the performance of the classifier use_parallel : bool, optional (default=False) flag to use parallel processing use_mlflow : bool, optional (default=False) @@ -85,8 +84,8 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f # Input search space hyperparameters self.hyperparam_space = hyperparam_space - self.eval_function = eval_function - self.score_function = score_function + # ML Evaluator + self.evaluator = Evaluator(eval_function=eval_function, fitness_score=fitness_score) # State vars self.eval_dict = {} @@ -199,7 +198,7 @@ def get_clf(self, individual): """ pass - def evaluate_clf(self, individual): + def evaluate_individual(self, individual): """ Method to evaluate the classifier from an individual. It uses the eval_function to evaluate the classifier. 
@@ -213,10 +212,12 @@ def evaluate_clf(self, individual): mean : float mean of the evaluation """ - mean = self.eval_function(self.features, self.labels, self.get_clf(individual), - score_function=self.score_function) - self.tracker.log_evaluation(self.get_clf(individual), mean) - return (mean,) + # mean = self.evaluator.eval_function(self.features, self.labels, self.get_clf(individual), + # score_function=self.evaluator.score_function) + clf = self.get_clf(individual) + metrics = self.evaluator.evaluate(clf=clf, features=self.features, labels=self.labels) + self.tracker.log_evaluation(self.get_clf(individual), metrics) + return (metrics[self.evaluator.fitness_score],) def population_2_df(self): """ @@ -367,7 +368,7 @@ def optimize_clf(self, population: int = 10, generations: int = 3, up=[x.max_value for x in self.hyperparam_space.evolvable_hyperparams.values()], indpb=0.5) toolbox.register("select", tools.selTournament, tournsize=4) - toolbox.register("evaluate", self.evaluate_clf) + toolbox.register("evaluate", self.evaluate_individual) # History hist = tools.History() @@ -404,7 +405,7 @@ def optimize_clf(self, population: int = 10, generations: int = 3, g2.write_html(os.path.join(self.tracker.graphics_path, "logbook.html")) plt.close() - #TODO: Log the best model (or top n) + # TODO: Log the best model (or top n) return self.get_clf(hof[0]) diff --git a/mloptimizer/test/test_genoptimizer/test_meta.py b/mloptimizer/test/test_genoptimizer/test_meta.py index 4b8de32..ce89604 100644 --- a/mloptimizer/test/test_genoptimizer/test_meta.py +++ b/mloptimizer/test/test_genoptimizer/test_meta.py @@ -7,7 +7,6 @@ from xgboost import XGBClassifier from sklearn.svm import SVC from mloptimizer.evaluation import kfold_score, train_score, train_test_score -from sklearn.metrics import accuracy_score, balanced_accuracy_score import time custom_evolvable_hyperparams = { @@ -35,7 +34,7 @@ def test_sklearn_optimizer(clf_class): @pytest.mark.parametrize('use_mlflow', [True, False]) def test_mloptimizer(use_mlflow): - X, y = load_iris(return_X_y=True) + X, y = load_breast_cancer(return_X_y=True) mlopt = SklearnOptimizer(clf_class=XGBClassifier, hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier), features=X, labels=y, use_mlflow=use_mlflow) @@ -43,37 +42,33 @@ def test_mloptimizer(use_mlflow): assert mlopt is not None -@pytest.mark.parametrize('target_metric', - (balanced_accuracy_score, accuracy_score)) @pytest.mark.parametrize('clf_class', (DecisionTreeClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, XGBClassifier, SVC)) @pytest.mark.parametrize('dataset', (load_breast_cancer, load_iris)) -def test_optimizer(clf_class, dataset, target_metric): +def test_optimizer(clf_class, dataset): X, y = dataset(return_X_y=True) evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class) - opt = SklearnOptimizer(features=X, labels=y, score_function=target_metric, clf_class=clf_class, + opt = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", clf_class=clf_class, hyperparam_space=evolvable_hyperparams) clf = opt.optimize_clf(2, 2) assert clf is not None -@pytest.mark.parametrize('target_metric', - (balanced_accuracy_score, accuracy_score)) @pytest.mark.parametrize('clf_class', (DecisionTreeClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, SVC)) @pytest.mark.parametrize('dataset', (load_breast_cancer, load_iris)) -def test_optimizer_use_parallel(clf_class, dataset, 
target_metric): +def test_optimizer_use_parallel(clf_class, dataset): X, y = dataset(return_X_y=True) evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class) my_seed = 25 population = 50 generations = 4 - opt_with_parallel = SklearnOptimizer(features=X, labels=y, score_function=target_metric, seed=my_seed, + opt_with_parallel = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", seed=my_seed, use_parallel=True, hyperparam_space=evolvable_hyperparams, clf_class=clf_class) @@ -81,7 +76,7 @@ def test_optimizer_use_parallel(clf_class, dataset, target_metric): clf_with_parallel = opt_with_parallel.optimize_clf(population, generations) end_time_parallel = time.time() - opt = SklearnOptimizer(features=X, labels=y, score_function=target_metric, seed=my_seed, use_parallel=False, + opt = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", seed=my_seed, use_parallel=False, hyperparam_space=evolvable_hyperparams, clf_class=clf_class) start_time = time.time() @@ -102,24 +97,23 @@ def test_optimizer_use_parallel(clf_class, dataset, target_metric): @pytest.mark.parametrize('clf_class', (DecisionTreeClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, XGBClassifier, SVC)) -@pytest.mark.parametrize('target_metric', (balanced_accuracy_score, accuracy_score)) @pytest.mark.parametrize('target_score', (kfold_score, train_score, train_test_score)) -def test_reproducibility(clf_class, target_metric, target_score): +def test_reproducibility(clf_class, target_score): X, y = load_iris(return_X_y=True) evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class) population = 2 generations = 2 seed = 25 distinct_seed = 2 - optimizer1 = SklearnOptimizer(features=X, labels=y, score_function=target_metric, + optimizer1 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", eval_function=target_score, seed=seed, clf_class=clf_class, hyperparam_space=evolvable_hyperparams) result1 = optimizer1.optimize_clf(population=population, generations=generations) - optimizer2 = SklearnOptimizer(features=X, labels=y, score_function=target_metric, + optimizer2 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", eval_function=target_score, seed=seed, clf_class=clf_class, hyperparam_space=evolvable_hyperparams) result2 = optimizer2.optimize_clf(population=population, generations=generations) - optimizer3 = SklearnOptimizer(features=X, labels=y, score_function=target_metric, + optimizer3 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", eval_function=target_score, seed=distinct_seed, clf_class=clf_class, hyperparam_space=evolvable_hyperparams) result3 = optimizer3.optimize_clf(population=population, generations=generations) diff --git a/mloptimizer/test/test_model_evaluation.py b/mloptimizer/test/test_model_evaluation.py index 23b8bca..a0ed0ca 100644 --- a/mloptimizer/test/test_model_evaluation.py +++ b/mloptimizer/test/test_model_evaluation.py @@ -13,41 +13,43 @@ def classification_mock_data(): return features, labels -def test_kfold_stratified_score(classification_mock_data): +@pytest.fixture +def metrics_dict(): + return { + "accuracy": accuracy_score + } + + +def test_kfold_stratified_score(classification_mock_data, metrics_dict): features, labels = classification_mock_data clf = DecisionTreeClassifier() - score = kfold_stratified_score(features, labels, clf) - assert isinstance(score, float) - assert 0 <= score <= 1 # Score should be between 0 and 1 + metrics = 
kfold_stratified_score(features, labels, clf, metrics_dict) + assert isinstance(metrics, dict) -def test_temporal_kfold_score(classification_mock_data): +def test_temporal_kfold_score(classification_mock_data, metrics_dict): features, labels = classification_mock_data clf = DecisionTreeClassifier() - score = temporal_kfold_score(features, labels, clf) - assert isinstance(score, float) - assert 0 <= score <= 1 # Score should be between 0 and 1 + metrics = temporal_kfold_score(features, labels, clf, metrics_dict) + assert isinstance(metrics, dict) -def test_train_score(classification_mock_data): +def test_train_score(classification_mock_data, metrics_dict): features, labels = classification_mock_data clf = DecisionTreeClassifier() - accuracy = train_score(features, labels, clf, score_function=accuracy_score) - assert isinstance(accuracy, float) - assert 0 <= accuracy <= 1 + metrics = train_score(features, labels, clf, metrics_dict) + assert isinstance(metrics, dict) -def test_test_train_score(classification_mock_data): +def test_test_train_score(classification_mock_data, metrics_dict): features, labels = classification_mock_data clf = DecisionTreeClassifier() - accuracy = train_test_score(features, labels, clf, score_function=accuracy_score) - assert isinstance(accuracy, float) - assert 0 <= accuracy <= 1 + metrics = train_test_score(features, labels, clf, metrics_dict) + assert isinstance(metrics, dict) -def test_kfold_score(classification_mock_data): +def test_kfold_score(classification_mock_data, metrics_dict): features, labels = classification_mock_data clf = DecisionTreeClassifier() - accuracy = kfold_score(features, labels, clf, score_function=accuracy_score) - assert isinstance(accuracy, float) - assert 0 <= accuracy <= 1 + metrics = kfold_score(features, labels, clf, metrics_dict) + assert isinstance(metrics, dict) From 3f4f6b505676ff32f6f132375669ffb085afa8c4 Mon Sep 17 00:00:00 2001 From: Caparrini Date: Sat, 9 Mar 2024 12:17:40 +0100 Subject: [PATCH 03/16] Refactor: move test and code to subpackages --- examples/plot_evolution.py | 2 +- examples/plot_quickstart.py | 2 -- examples/plot_search_space.py | 2 +- mloptimizer/{ => aux}/alg_wrapper.py | 0 mloptimizer/{ => aux}/plots.py | 0 mloptimizer/evaluation/evaluator.py | 2 +- mloptimizer/genoptimizer/base.py | 10 ++++-- mloptimizer/genoptimizer/keras.py | 2 +- mloptimizer/test/test_aux/__init__.py | 0 .../test_CustomXGBClassifier.py | 2 +- .../test/{ => test_aux}/test_alg_wrapper.py | 3 +- mloptimizer/test/{ => test_aux}/test_plots.py | 2 +- mloptimizer/test/{ => test_aux}/test_utils.py | 0 mloptimizer/test/test_evaluation/__init__.py | 0 .../test_model_evaluation.py | 0 .../test/test_genoptimizer/test_meta.py | 31 +++++++++++++------ 16 files changed, 35 insertions(+), 23 deletions(-) rename mloptimizer/{ => aux}/alg_wrapper.py (100%) rename mloptimizer/{ => aux}/plots.py (100%) create mode 100644 mloptimizer/test/test_aux/__init__.py rename mloptimizer/test/{ => test_aux}/test_CustomXGBClassifier.py (95%) rename mloptimizer/test/{ => test_aux}/test_alg_wrapper.py (92%) rename mloptimizer/test/{ => test_aux}/test_plots.py (92%) rename mloptimizer/test/{ => test_aux}/test_utils.py (100%) create mode 100644 mloptimizer/test/test_evaluation/__init__.py rename mloptimizer/test/{ => test_evaluation}/test_model_evaluation.py (100%) diff --git a/examples/plot_evolution.py b/examples/plot_evolution.py index 6b81aed..d0cbe7a 100644 --- a/examples/plot_evolution.py +++ b/examples/plot_evolution.py @@ -7,7 +7,7 @@ from 
mloptimizer.genoptimizer import SklearnOptimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier -from mloptimizer.plots import plotly_logbook +from mloptimizer.aux.plots import plotly_logbook import plotly import os from sklearn.datasets import load_iris diff --git a/examples/plot_quickstart.py b/examples/plot_quickstart.py index 7d517e1..dafe90c 100644 --- a/examples/plot_quickstart.py +++ b/examples/plot_quickstart.py @@ -8,8 +8,6 @@ from mloptimizer.genoptimizer import SklearnOptimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier -from mloptimizer.plots import plotly_logbook, plotly_search_space -import plotly from sklearn.datasets import load_iris # %% diff --git a/examples/plot_search_space.py b/examples/plot_search_space.py index 98019ea..625bf8c 100644 --- a/examples/plot_search_space.py +++ b/examples/plot_search_space.py @@ -7,7 +7,7 @@ from mloptimizer.genoptimizer import SklearnOptimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier -from mloptimizer.plots import plotly_search_space +from mloptimizer.aux.plots import plotly_search_space import plotly import os from sklearn.datasets import load_iris diff --git a/mloptimizer/alg_wrapper.py b/mloptimizer/aux/alg_wrapper.py similarity index 100% rename from mloptimizer/alg_wrapper.py rename to mloptimizer/aux/alg_wrapper.py diff --git a/mloptimizer/plots.py b/mloptimizer/aux/plots.py similarity index 100% rename from mloptimizer/plots.py rename to mloptimizer/aux/plots.py diff --git a/mloptimizer/evaluation/evaluator.py b/mloptimizer/evaluation/evaluator.py index 32a2e11..0e09af2 100644 --- a/mloptimizer/evaluation/evaluator.py +++ b/mloptimizer/evaluation/evaluator.py @@ -22,7 +22,7 @@ class Evaluator: The metrics to use to evaluate the performance of the classifier Dictionary of the form {"metric_name": metric_function} """ - def __init__(self, eval_function, fitness_score="balanced_accuracy", metrics=None): + def __init__(self, eval_function, fitness_score="accuracy", metrics=None): if metrics is None: self.metrics = _default_metrics() else: diff --git a/mloptimizer/genoptimizer/base.py b/mloptimizer/genoptimizer/base.py index 06921b4..d0788b4 100644 --- a/mloptimizer/genoptimizer/base.py +++ b/mloptimizer/genoptimizer/base.py @@ -12,7 +12,7 @@ from sklearn.metrics import accuracy_score from mloptimizer.evaluation import train_score -from mloptimizer.plots import plotly_logbook, plotly_search_space +from mloptimizer.aux.plots import plotly_logbook, plotly_search_space from mloptimizer.hyperparams import HyperparameterSpace from mloptimizer.aux import Tracker, utils from mloptimizer.evaluation import Evaluator @@ -50,7 +50,7 @@ class BaseOptimizer(object): def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_file="mloptimizer.log", hyperparam_space: HyperparameterSpace = None, eval_function=train_score, - fitness_score="accuracy", seed=random.randint(0, 1000000), + fitness_score="accuracy", metrics=None, seed=random.randint(0, 1000000), use_parallel=False, use_mlflow=False): """ Creates object BaseOptimizer. 
@@ -85,7 +85,11 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f self.hyperparam_space = hyperparam_space # ML Evaluator - self.evaluator = Evaluator(eval_function=eval_function, fitness_score=fitness_score) + if metrics is None: + metrics = {"accuracy": accuracy_score} + + self.evaluator = Evaluator(eval_function=eval_function, fitness_score=fitness_score, + metrics=metrics) # State vars self.eval_dict = {} diff --git a/mloptimizer/genoptimizer/keras.py b/mloptimizer/genoptimizer/keras.py index 9b55ce1..f95cb0a 100644 --- a/mloptimizer/genoptimizer/keras.py +++ b/mloptimizer/genoptimizer/keras.py @@ -1,5 +1,5 @@ from abc import ABC -from mloptimizer.alg_wrapper import generate_model +from mloptimizer.aux.alg_wrapper import generate_model from mloptimizer.genoptimizer import BaseOptimizer from mloptimizer.hyperparams import Hyperparam diff --git a/mloptimizer/test/test_aux/__init__.py b/mloptimizer/test/test_aux/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mloptimizer/test/test_CustomXGBClassifier.py b/mloptimizer/test/test_aux/test_CustomXGBClassifier.py similarity index 95% rename from mloptimizer/test/test_CustomXGBClassifier.py rename to mloptimizer/test/test_aux/test_CustomXGBClassifier.py index 8cb1acf..a43538f 100644 --- a/mloptimizer/test/test_CustomXGBClassifier.py +++ b/mloptimizer/test/test_aux/test_CustomXGBClassifier.py @@ -1,7 +1,7 @@ import pytest import numpy as np from sklearn.datasets import make_classification -from mloptimizer.alg_wrapper import CustomXGBClassifier +from mloptimizer.aux.alg_wrapper import CustomXGBClassifier @pytest.fixture diff --git a/mloptimizer/test/test_alg_wrapper.py b/mloptimizer/test/test_aux/test_alg_wrapper.py similarity index 92% rename from mloptimizer/test/test_alg_wrapper.py rename to mloptimizer/test/test_aux/test_alg_wrapper.py index 9b3e18f..9f70f43 100644 --- a/mloptimizer/test/test_alg_wrapper.py +++ b/mloptimizer/test/test_aux/test_alg_wrapper.py @@ -1,5 +1,4 @@ -import pytest -from mloptimizer.alg_wrapper import generate_model +from mloptimizer.aux.alg_wrapper import generate_model from sklearn.datasets import load_breast_cancer import numpy as np import sys, subprocess diff --git a/mloptimizer/test/test_plots.py b/mloptimizer/test/test_aux/test_plots.py similarity index 92% rename from mloptimizer/test/test_plots.py rename to mloptimizer/test/test_aux/test_plots.py index 9f7ab88..9f01ee9 100644 --- a/mloptimizer/test/test_plots.py +++ b/mloptimizer/test/test_aux/test_plots.py @@ -1,5 +1,5 @@ import pytest -from mloptimizer.plots import logbook_to_pandas, plot_logbook, plot_search_space +from mloptimizer.aux.plots import logbook_to_pandas, plot_logbook, plot_search_space from mloptimizer.genoptimizer import SklearnOptimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier diff --git a/mloptimizer/test/test_utils.py b/mloptimizer/test/test_aux/test_utils.py similarity index 100% rename from mloptimizer/test/test_utils.py rename to mloptimizer/test/test_aux/test_utils.py diff --git a/mloptimizer/test/test_evaluation/__init__.py b/mloptimizer/test/test_evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mloptimizer/test/test_model_evaluation.py b/mloptimizer/test/test_evaluation/test_model_evaluation.py similarity index 100% rename from mloptimizer/test/test_model_evaluation.py rename to mloptimizer/test/test_evaluation/test_model_evaluation.py diff --git 
a/mloptimizer/test/test_genoptimizer/test_meta.py b/mloptimizer/test/test_genoptimizer/test_meta.py index ce89604..0951982 100644 --- a/mloptimizer/test/test_genoptimizer/test_meta.py +++ b/mloptimizer/test/test_genoptimizer/test_meta.py @@ -8,6 +8,7 @@ from sklearn.svm import SVC from mloptimizer.evaluation import kfold_score, train_score, train_test_score import time +from sklearn.metrics import accuracy_score custom_evolvable_hyperparams = { "min_samples_split": Hyperparam("min_samples_split", 2, 50, 'int'), @@ -19,6 +20,13 @@ } +@pytest.fixture +def default_metrics_dict(): + return { + "accuracy": accuracy_score, + } + + @pytest.mark.parametrize('clf_class', (DecisionTreeClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, XGBClassifier, SVC)) @@ -47,10 +55,11 @@ def test_mloptimizer(use_mlflow): GradientBoostingClassifier, XGBClassifier, SVC)) @pytest.mark.parametrize('dataset', (load_breast_cancer, load_iris)) -def test_optimizer(clf_class, dataset): +def test_optimizer(clf_class, dataset, default_metrics_dict): X, y = dataset(return_X_y=True) evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class) - opt = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", clf_class=clf_class, + opt = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", + metrics=default_metrics_dict, clf_class=clf_class, hyperparam_space=evolvable_hyperparams) clf = opt.optimize_clf(2, 2) assert clf is not None @@ -61,22 +70,24 @@ def test_optimizer(clf_class, dataset): GradientBoostingClassifier, SVC)) @pytest.mark.parametrize('dataset', (load_breast_cancer, load_iris)) -def test_optimizer_use_parallel(clf_class, dataset): +def test_optimizer_use_parallel(clf_class, dataset, default_metrics_dict): X, y = dataset(return_X_y=True) evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class) my_seed = 25 population = 50 generations = 4 - opt_with_parallel = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", seed=my_seed, - use_parallel=True, + opt_with_parallel = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", + metrics=default_metrics_dict, + seed=my_seed, use_parallel=True, hyperparam_space=evolvable_hyperparams, clf_class=clf_class) start_time_parallel = time.time() clf_with_parallel = opt_with_parallel.optimize_clf(population, generations) end_time_parallel = time.time() - opt = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", seed=my_seed, use_parallel=False, + opt = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict, + seed=my_seed, use_parallel=False, hyperparam_space=evolvable_hyperparams, clf_class=clf_class) start_time = time.time() @@ -98,22 +109,22 @@ def test_optimizer_use_parallel(clf_class, dataset): (DecisionTreeClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, XGBClassifier, SVC)) @pytest.mark.parametrize('target_score', (kfold_score, train_score, train_test_score)) -def test_reproducibility(clf_class, target_score): +def test_reproducibility(clf_class, target_score, default_metrics_dict): X, y = load_iris(return_X_y=True) evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class) population = 2 generations = 2 seed = 25 distinct_seed = 2 - optimizer1 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", + optimizer1 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict, 
                                  eval_function=target_score, seed=seed, clf_class=clf_class,
                                  hyperparam_space=evolvable_hyperparams)
     result1 = optimizer1.optimize_clf(population=population, generations=generations)
-    optimizer2 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy",
+    optimizer2 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict,
                                   eval_function=target_score, seed=seed, clf_class=clf_class,
                                   hyperparam_space=evolvable_hyperparams)
     result2 = optimizer2.optimize_clf(population=population, generations=generations)
-    optimizer3 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy",
+    optimizer3 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict,
                                   eval_function=target_score, seed=distinct_seed, clf_class=clf_class,
                                   hyperparam_space=evolvable_hyperparams)
     result3 = optimizer3.optimize_clf(population=population, generations=generations)

From d9f66bc05aedf63f18310d1b60a06f893a88e507 Mon Sep 17 00:00:00 2001
From: Caparrini
Date: Sat, 9 Mar 2024 21:28:50 +0100
Subject: [PATCH 04/16] Added: test for checkpoints; decoupled checkpoint
 loading

---
 mloptimizer/aux/tracker.py              | 16 ++++++++++++++
 mloptimizer/genoptimizer/base.py        | 16 +++++++++-----
 .../test/test_genoptimizer/test_meta.py | 22 +++++++++++++++++++
 3 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/mloptimizer/aux/tracker.py b/mloptimizer/aux/tracker.py
index 574a3b5..6638cab 100644
--- a/mloptimizer/aux/tracker.py
+++ b/mloptimizer/aux/tracker.py
@@ -3,6 +3,7 @@
 import shutil
 from datetime import datetime
 import importlib
+import joblib
 
 
 class Tracker:
@@ -109,3 +110,18 @@ def log_evaluation(self, classifier, metrics):
                 # We use the generation as the step
                 # self.mlflow.log_metric(key="fitness", value=metric, step=self.gen)
                 self.mlflow.log_metrics(metrics, step=self.gen)
+
+    def load_checkpoint(self, checkpoint):
+
+        # Extract checkpoint_path from checkpoint file
+        self.opt_run_checkpoint_path = os.path.dirname(checkpoint)
+        self.opt_run_folder = os.path.dirname(self.opt_run_checkpoint_path)
+        self.optimization_logger, _ = init_logger(os.path.join(self.opt_run_folder,
+                                                               f"opt_{os.path.basename(checkpoint)}.log"))
+        self.optimization_logger.info("Initiating from checkpoint {}...".format(checkpoint))
+
+        self.results_path = os.path.join(self.opt_run_folder, "results")
+        self.graphics_path = os.path.join(self.opt_run_folder, "graphics")
+        self.progress_path = os.path.join(self.opt_run_folder, "progress")
+        cp = joblib.load(checkpoint)
+        return cp
diff --git a/mloptimizer/genoptimizer/base.py b/mloptimizer/genoptimizer/base.py
index d0788b4..d266681 100644
--- a/mloptimizer/genoptimizer/base.py
+++ b/mloptimizer/genoptimizer/base.py
@@ -346,18 +346,22 @@ def optimize_clf(self, population: int = 10, generations: int = 3,
         self.logbook = tools.Logbook()
 
         if checkpoint:
-            self.tracker.optimization_logger, _ = utils.init_logger(os.path.join(checkpoint, "opt.log"))
-            cp = joblib.load(checkpoint)
-            self.tracker.optimization_logger.info("Initiating from checkpoint {}...".format(checkpoint))
+
+            # Load checkpoint
+            cp = self.tracker.load_checkpoint(checkpoint)
+
+            # self.tracker.optimization_logger, _ = utils.init_logger(os.path.join(checkpoint, "opt.log"))
+            # cp = joblib.load(checkpoint)
+            # self.tracker.optimization_logger.info("Initiating from checkpoint {}...".format(checkpoint))
             pop = cp['population']
             start_gen = cp['generation'] + 1
             hof = cp['halloffame']
             self.logbook = cp['logbook']
             random.setstate(cp['rndstate'])
             # Extract
checkpoint_path from checkpoint file
-            self.tracker.opt_run_checkpoint_path = os.path.dirname(checkpoint)
-            self.tracker.results_path = os.path.join(self.tracker.opt_run_checkpoint_path, "results")
-            self.tracker.graphics_path = os.path.join(self.tracker.opt_run_checkpoint_path, "graphics")
+            # self.tracker.opt_run_checkpoint_path = os.path.dirname(checkpoint)
+            # self.tracker.results_path = os.path.join(self.tracker.opt_run_checkpoint_path, "results")
+            # self.tracker.graphics_path = os.path.join(self.tracker.opt_run_checkpoint_path, "graphics")
 
         else:
             self.logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])
diff --git a/mloptimizer/test/test_genoptimizer/test_meta.py b/mloptimizer/test/test_genoptimizer/test_meta.py
index 0951982..7243dd9 100644
--- a/mloptimizer/test/test_genoptimizer/test_meta.py
+++ b/mloptimizer/test/test_genoptimizer/test_meta.py
@@ -8,6 +8,7 @@ from sklearn.svm import SVC
 from mloptimizer.evaluation import kfold_score, train_score, train_test_score
 import time
+import os
 from sklearn.metrics import accuracy_score
 
 custom_evolvable_hyperparams = {
     "min_samples_split": Hyperparam("min_samples_split", 2, 50, 'int'),
@@ -50,6 +51,27 @@ def test_mloptimizer(use_mlflow):
     assert mlopt is not None
 
 
+def test_checkpoints():
+    X, y = load_breast_cancer(return_X_y=True)
+    mlopt = SklearnOptimizer(clf_class=XGBClassifier,
+                             hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier),
+                             features=X, labels=y)
+    clf = mlopt.optimize_clf(5, 5)
+
+    mlopt2 = SklearnOptimizer(clf_class=XGBClassifier,
+                              hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier),
+                              features=X, labels=y,
+                              seed=mlopt.mlopt_seed)
+
+    checkpoint = os.path.join(mlopt.tracker.opt_run_checkpoint_path,
+                              os.listdir(mlopt.tracker.opt_run_checkpoint_path)[-2]
+                              )
+    clf2 = mlopt2.optimize_clf(5, 5,
+                               checkpoint=checkpoint)
+    assert mlopt is not None
+    assert str(clf) == str(clf2)
+
+
 @pytest.mark.parametrize('clf_class',
                          (DecisionTreeClassifier, RandomForestClassifier, ExtraTreesClassifier,
                           GradientBoostingClassifier, XGBClassifier, SVC))

From 090dc665491d60253382b324ede1c22afd7d913b Mon Sep 17 00:00:00 2001
From: Caparrini
Date: Sat, 9 Mar 2024 21:41:08 +0100
Subject: [PATCH 05/16] Refactor: log and save results in another method

---
 mloptimizer/genoptimizer/base.py | 59 +++++++++++++++-----------------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/mloptimizer/genoptimizer/base.py b/mloptimizer/genoptimizer/base.py
index d266681..35ad1fd 100644
--- a/mloptimizer/genoptimizer/base.py
+++ b/mloptimizer/genoptimizer/base.py
@@ -290,6 +290,28 @@ def _read_logbook_file(self, filename=None):
             self.tracker.optimization_logger.error("File {} does not exist".format(filename))
         return data
 
+    def _log_and_save_results(self, hof):
+        halloffame_classifiers = list(map(self.get_clf, hof))
+        halloffame_fitness = [ind.fitness.values[:] for ind in hof]
+        self.tracker.log_clfs(classifiers_list=halloffame_classifiers, generation=-1,
+                              fitness_list=halloffame_fitness)
+
+        self._write_population_file()
+        self._write_logbook_file()
+
+        hyperparam_names = list(self.hyperparam_space.evolvable_hyperparams.keys())
+        hyperparam_names.append("fitness")
+        population_df = self.population_2_df()
+        df = population_df[hyperparam_names]
+        g = plotly_search_space(df)
+        g.write_html(os.path.join(self.tracker.graphics_path, "search_space.html"))
+        plt.close()
+
+        g2 = plotly_logbook(self.logbook, population_df)
+        # g2.savefig(os.path.join(self.graphics_path, "logbook.png"))
+        g2.write_html(os.path.join(self.tracker.graphics_path,
"logbook.html")) + plt.close() + def optimize_clf(self, population: int = 10, generations: int = 3, checkpoint: str = None, opt_run_folder_name: str = None) -> object: """ @@ -349,19 +371,12 @@ def optimize_clf(self, population: int = 10, generations: int = 3, # Load checkpoint cp = self.tracker.load_checkpoint(checkpoint) - - # self.tracker.optimization_logger, _ = utils.init_logger(os.path.join(checkpoint, "opt.log")) - # cp = joblib.load(checkpoint) - # self.tracker.optimization_logger.info("Initiating from checkpoint {}...".format(checkpoint)) pop = cp['population'] start_gen = cp['generation'] + 1 hof = cp['halloffame'] self.logbook = cp['logbook'] random.setstate(cp['rndstate']) - # Extract checkpoint_path from checkpoint file - # self.tracker.opt_run_checkpoint_path = os.path.dirname(checkpoint) - # self.tracker.results_path = os.path.join(self.tracker.opt_run_checkpoint_path, "results") - # self.tracker.graphics_path = os.path.join(self.tracker.opt_run_checkpoint_path, "graphics") + else: self.logbook.header = ['gen', 'nevals'] + (stats.fields if stats else []) @@ -389,31 +404,11 @@ def optimize_clf(self, population: int = 10, generations: int = 3, start_gen=start_gen, ngen=generations, stats=stats, halloffame=hof) - self.tracker.optimization_logger.info("LOGBOOK: \n{}".format(self.logbook)) - self.tracker.optimization_logger.info("HALL OF FAME: {} individuals".format(len(hof))) - - halloffame_classifiers = list(map(self.get_clf, hof)) - halloffame_fitness = [ind.fitness.values[:] for ind in hof] - self.tracker.log_clfs(classifiers_list=halloffame_classifiers, generation=-1, - fitness_list=halloffame_fitness) - - self._write_population_file() - self._write_logbook_file() - # self.plot_logbook(logbook=logbook) - hyperparam_names = list(self.hyperparam_space.evolvable_hyperparams.keys()) - hyperparam_names.append("fitness") - population_df = self.population_2_df() - df = population_df[hyperparam_names] - g = plotly_search_space(df) - g.write_html(os.path.join(self.tracker.graphics_path, "search_space.html")) - plt.close() - - g2 = plotly_logbook(self.logbook, population_df) - # g2.savefig(os.path.join(self.graphics_path, "logbook.png")) - g2.write_html(os.path.join(self.tracker.graphics_path, "logbook.html")) - plt.close() + # self.tracker.optimization_logger.info("LOGBOOK: \n{}".format(self.logbook)) + # self.tracker.optimization_logger.info("HALL OF FAME: {} individuals".format(len(hof))) - # TODO: Log the best model (or top n) + # Log and save results + self._log_and_save_results(hof) return self.get_clf(hof[0]) From 5b83c2e3d4bf57bae01c105c5c3a4bf6c2da0a15 Mon Sep 17 00:00:00 2001 From: Caparrini Date: Sat, 9 Mar 2024 21:43:30 +0100 Subject: [PATCH 06/16] Deleted: comments --- mloptimizer/genoptimizer/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mloptimizer/genoptimizer/base.py b/mloptimizer/genoptimizer/base.py index 35ad1fd..9490075 100644 --- a/mloptimizer/genoptimizer/base.py +++ b/mloptimizer/genoptimizer/base.py @@ -216,8 +216,6 @@ def evaluate_individual(self, individual): mean : float mean of the evaluation """ - # mean = self.evaluator.eval_function(self.features, self.labels, self.get_clf(individual), - # score_function=self.evaluator.score_function) clf = self.get_clf(individual) metrics = self.evaluator.evaluate(clf=clf, features=self.features, labels=self.labels) self.tracker.log_evaluation(self.get_clf(individual), metrics) From 4848cfff27a576dd360897d9469db98c9eefc1a6 Mon Sep 17 00:00:00 2001 From: Caparrini Date: Mon, 11 Mar 2024 00:45:25 +0100 
Subject: [PATCH 07/16] Refactor: several changes to decouple optimize_clf

---
 README.md                                  |   2 +-
 docs/conf.py                               |   2 +-
 docs/sections/Concepts/score_functions.rst |   6 +-
 examples/plot_evolution.py                 |   4 +-
 examples/plot_quickstart.py                |   2 +-
 examples/plot_search_space.py              |   4 +-
 mloptimizer/aux/tracker.py                 |  55 ++
 .../{genoptimizer => core}/__init__.py     |   0
 mloptimizer/core/base.py                   | 228 ++++++++
 mloptimizer/{genoptimizer => core}/keras.py |  4 +-
 mloptimizer/{genoptimizer => core}/meta.py |   6 +-
 mloptimizer/evaluation/evaluator.py        |  24 +-
 mloptimizer/genoptimizer/base.py           | 544 ------------------
 mloptimizer/test/test_aux/test_plots.py    |   4 +-
 .../test/test_genoptimizer/test_meta.py    |   8 +-
 setup.py                                   |   2 +-
 16 files changed, 328 insertions(+), 567 deletions(-)
 rename mloptimizer/{genoptimizer => core}/__init__.py (100%)
 create mode 100644 mloptimizer/core/base.py
 rename mloptimizer/{genoptimizer => core}/keras.py (92%)
 rename mloptimizer/{genoptimizer => core}/meta.py (87%)
 delete mode 100644 mloptimizer/genoptimizer/base.py

diff --git a/README.md b/README.md
index f0c5c04..6e19b13 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ You can get more information about the package installation at https://pypi.org/
 Here's a simple example of how to optimize hyperparameters in a decision tree classifier using the iris dataset:
 
 ```python
-from mloptimizer.genoptimizer import SklearnOptimizer
+from mloptimizer.core import SklearnOptimizer
 from mloptimizer.hyperparams import HyperparameterSpace
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.datasets import load_iris
diff --git a/docs/conf.py b/docs/conf.py
index 82e406d..ad6da04 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -14,7 +14,7 @@
 project = 'mloptimizer'
 copyright = '2024, Antonio Caparrini'
 author = 'Antonio Caparrini'
-release = '0.7.0'
+release = '0.7.1'
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/docs/sections/Concepts/score_functions.rst b/docs/sections/Concepts/score_functions.rst
index 32f48f1..6983448 100644
--- a/docs/sections/Concepts/score_functions.rst
+++ b/docs/sections/Concepts/score_functions.rst
@@ -1,6 +1,6 @@
-====================
-Score Functions
-====================
+=============================
+Score Functions (NEED UPDATE)
+=============================
 
 The `model_evaluation.py` module in our library provides several score functions that are used to evaluate the performance of machine learning algorithms. These score functions are crucial in the context of genetic optimization, where they serve as fitness values. In genetic optimization, a fitness value determines how well an individual (in this case, a machine learning algorithm defined by its hyperparameters) performs in a given generation. The better the fitness value, the more likely the individual is to survive and reproduce in the next generation.
 
diff --git a/examples/plot_evolution.py b/examples/plot_evolution.py
index d0cbe7a..68c7723 100644
--- a/examples/plot_evolution.py
+++ b/examples/plot_evolution.py
@@ -4,7 +4,7 @@
 mloptimizer provides a function to plot the evolution of the fitness function.
""" -from mloptimizer.genoptimizer import SklearnOptimizer +from mloptimizer.core import SklearnOptimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from mloptimizer.aux.plots import plotly_logbook @@ -40,7 +40,7 @@ # The black lines represent the max and min fitness values across all generations. # The green, red and blue line are respectively the max, min and avg fitness value for each generation. # Each grey point in the graph represents an individual. -population_df = opt.population_2_df() +population_df = opt.runs[-1].population_2_df() g_logbook = plotly_logbook(opt.logbook, population_df) plotly.io.show(g_logbook) diff --git a/examples/plot_quickstart.py b/examples/plot_quickstart.py index dafe90c..aaadac0 100644 --- a/examples/plot_quickstart.py +++ b/examples/plot_quickstart.py @@ -5,7 +5,7 @@ Firstly, we import the necessary libraries to get data and plot the results. """ -from mloptimizer.genoptimizer import SklearnOptimizer +from mloptimizer.core import SklearnOptimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from sklearn.datasets import load_iris diff --git a/examples/plot_search_space.py b/examples/plot_search_space.py index 625bf8c..f1f508a 100644 --- a/examples/plot_search_space.py +++ b/examples/plot_search_space.py @@ -4,7 +4,7 @@ mloptimizer provides a function to plot the search space of the optimization. """ -from mloptimizer.genoptimizer import SklearnOptimizer +from mloptimizer.core import SklearnOptimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from mloptimizer.aux.plots import plotly_search_space @@ -37,7 +37,7 @@ # %% # Following we can generate the plot of the search space -population_df = opt.population_2_df() +population_df = opt.runs[-1].population_2_df() param_names = list(opt.hyperparam_space.evolvable_hyperparams.keys()) param_names.append("fitness") df = population_df[param_names] diff --git a/mloptimizer/aux/tracker.py b/mloptimizer/aux/tracker.py index 6638cab..06ac655 100644 --- a/mloptimizer/aux/tracker.py +++ b/mloptimizer/aux/tracker.py @@ -4,6 +4,7 @@ from datetime import datetime import importlib import joblib +import pandas as pd class Tracker: @@ -18,9 +19,12 @@ class Tracker: Folder where the optimization process will be stored. log_file : str Name of the log file. + use_mlflow : bool + If True, the optimization process will be tracked using MLFlow. 
""" def __init__(self, name, folder=os.curdir, log_file="mloptimizer.log", use_mlflow=False): + self.name = name self.gen = 0 # Main folder, current by default @@ -125,3 +129,54 @@ def load_checkpoint(self, checkpoint): self.progress_path = os.path.join(self.opt_run_folder, "progress") cp = joblib.load(checkpoint) return cp + + def write_logbook_file(self, logbook, filename=None): + """ + Method to write the logbook to a csv file + + Parameters + ---------- + logbook : ~deap.tools.Logbook + logbook of the optimization process + filename : str, optional (default=None) + filename to save the logbook + """ + if filename is None: + filename = os.path.join(self.results_path, 'logbook.csv') + pd.DataFrame(logbook).to_csv(filename, index=False) + + def write_population_file(self, populations, filename=None): + """ + Method to write the population to a csv file + + Parameters + ---------- + filename : str, optional (default=None) + filename to save the population + """ + if filename is None: + filename = os.path.join(self.results_path, 'populations.csv') + populations.sort_values(by=['fitness'], ascending=False + ).to_csv(filename, index=False) + + def start_progress_file(self, gen: int): + progress_gen_path = os.path.join(self.progress_path, "Generation_{}.csv".format(gen)) + header_progress_gen_file = "i;total;Individual;fitness\n" + with open(progress_gen_path, "w") as progress_gen_file: + progress_gen_file.write(header_progress_gen_file) + progress_gen_file.close() + self.optimization_logger.info("Generation: {}".format(gen)) + + def append_progress_file(self, gen, c, evaluations_pending, ind_formatted, fit): + self.optimization_logger.info( + "Fitting individual (informational purpose): gen {} - ind {} of {}".format( + gen, c, evaluations_pending + ) + ) + progress_gen_path = os.path.join(self.progress_path, "Generation_{}.csv".format(gen)) + with open(progress_gen_path, "a") as progress_gen_file: + progress_gen_file.write( + "{};{};{};{}\n".format(c, + evaluations_pending, + ind_formatted, fit) + ) diff --git a/mloptimizer/genoptimizer/__init__.py b/mloptimizer/core/__init__.py similarity index 100% rename from mloptimizer/genoptimizer/__init__.py rename to mloptimizer/core/__init__.py diff --git a/mloptimizer/core/base.py b/mloptimizer/core/base.py new file mode 100644 index 0000000..f7b046d --- /dev/null +++ b/mloptimizer/core/base.py @@ -0,0 +1,228 @@ +import os +import random +from abc import ABCMeta, abstractmethod +import numpy as np +from sklearn.metrics import accuracy_score + +from mloptimizer.evaluation import train_score +from mloptimizer.genetic import IndividualUtils +from mloptimizer.hyperparams import HyperparameterSpace +from mloptimizer.aux import Tracker +from mloptimizer.evaluation import Evaluator + +from mloptimizer.genetic import DeapOptimizer, GeneticAlgorithmRunner + + +class BaseOptimizer(object): + """ + Base class for the optimization of a classifier + + Attributes + ---------- + features : np.array + np.array with the features + labels : np.array + np.array with the labels + hyperparam_space : HyperparameterSpace + object with the hyperparameter space: fixed and evolvable hyperparams + evaluator : Evaluator + object to evaluate the classifier + eval_dict : dict + dictionary with the evaluation of the individuals + populations : list + list of populations + logbook : list + list of logbook + seed : int + seed for the random functions + use_parallel : bool + flag to use parallel processing + use_mlflow : bool + flag to use mlflow + """ + __metaclass__ = ABCMeta + + 
def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_file="mloptimizer.log", + hyperparam_space: HyperparameterSpace = None, + eval_function=train_score, + fitness_score="accuracy", metrics=None, seed=random.randint(0, 1000000), + use_parallel=False, use_mlflow=False): + """ + Creates object BaseOptimizer. + + Parameters + ---------- + features : np.array + np.array with the features + labels : np.array + np.array with the labels + folder : path, optional (default=os.curdir) + folder to store the structure of files and folders product of executions + log_file : str, optional (default="mloptimizer.log") + log file name + hyperparam_space : HyperparameterSpace, optional (default=None) + object with the hyperparameter space: fixed and evolvable hyperparams + eval_function : func, optional (default=train_score) + function to evaluate the model from X, y, clf + fitness_score : str, optional (default="accuracy") + fitness score to use to evaluate the performance of the classifier + use_parallel : bool, optional (default=False) + flag to use parallel processing + use_mlflow : bool, optional (default=False) + flag to use mlflow + seed : int, optional (default=0) + seed for the random functions (deap, models, and splits on evaluations) + """ + # Input mandatory variables + self.features = features + self.labels = labels + # Input search space hyperparameters + self.hyperparam_space = hyperparam_space + + # ML Evaluator + if metrics is None: + metrics = {"accuracy": accuracy_score} + + # State vars + self.eval_dict = {} + self.populations = [] + self.logbook = None + self.mlopt_seed = None + self.set_mlopt_seed(seed) + + # Parallel + self.use_parallel = use_parallel + + # mlflow + self.use_mlflow = use_mlflow + + # Tracker + self.tracker = Tracker(name="mloptimizer", folder=folder, log_file=log_file, use_mlflow=self.use_mlflow) + + # Evaluator + self.individual_utils = IndividualUtils(hyperparam_space=self.hyperparam_space, + clf_class=self.clf_class, mlopt_seed=self.mlopt_seed) + self.evaluator = Evaluator(features=features, labels=labels, + eval_function=eval_function, fitness_score=fitness_score, + metrics=metrics, tracker=self.tracker, + individual_utils=self.individual_utils) + + # DeapOptimizer + self.deap_optimizer = None + self.runs = [] + + def set_mlopt_seed(self, seed): + """ + Method to set the seed for the random functions + + Parameters + ---------- + seed : int + seed for the random functions + """ + self.mlopt_seed = seed + random.seed(seed) + np.random.seed(seed) + + @staticmethod + def get_subclasses(my_class): + """ + Method to get all the subclasses of a class + (in this case use to get all the classifiers that can be optimized). + + Parameters + ---------- + my_class : class + class to get the subclasses + + Returns + ------- + list + list of subclasses + """ + subclasses = my_class.__subclasses__() + if len(subclasses) == 0: + return [] + next_subclasses = [] + [next_subclasses.extend(BaseOptimizer.get_subclasses(x)) for x in subclasses] + return [*subclasses, *next_subclasses] + + @abstractmethod + def get_clf(self, individual): + """ + Method to get the classifier from an individual. Abstract method implemented in each specific optimizer. 
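+        As an illustration only (the exact mapping depends on the
+        ``HyperparameterSpace`` supplied), a decision tree optimizer would turn
+        an individual such as ``[5, 10]`` into
+        ``DecisionTreeClassifier(max_depth=5, min_samples_split=10)``.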
+ + Parameters + ---------- + individual : individual + individual to convert + + Returns + ------- + clf : classifier + classifier specific for the optimizer + """ + pass + + def optimize_clf(self, population_size: int = 10, generations: int = 3, + cxpb=0.5, mutpb=0.5, tournsize=4, indpb=0.5, n_elites=10, + checkpoint: str = None, opt_run_folder_name: str = None) -> object: + """ + Method to optimize the classifier. It uses the custom_ea_simple method to optimize the classifier. + + Parameters + ---------- + population_size : int, optional (default=10) + number of individuals in each generation + generations : int, optional (default=3) + number of generations + cxpb : float, optional (default=0.5) + crossover probability + mutpb : float, optional (default=0.5) + mutation probability + tournsize : int, optional (default=4) + number of individuals to select in the tournament + indpb : float, optional (default=0.5) + independent probability for each attribute to be mutated + n_elites : int, optional (default=10) + number of elites to keep in the next generation + checkpoint : str, optional (default=None) + path to the checkpoint file + opt_run_folder_name : str, optional (default=None) + name of the folder where the execution will be saved + + Returns + ------- + clf : classifier + classifier with the best hyperparams + """ + # Log initialization + self.tracker.start_optimization(type(self).__name__) + + # Creation of folders and checkpoint + self.tracker.start_checkpoint(opt_run_folder_name) + + # Creation of deap optimizer + self.deap_optimizer = DeapOptimizer(hyperparam_space=self.hyperparam_space, seed=self.mlopt_seed, + use_parallel=self.use_parallel) + # Creation of genetic algorithm runner + ga_runner = GeneticAlgorithmRunner(deap_optimizer=self.deap_optimizer, + tracker=self.tracker, + seed=self.mlopt_seed, + evaluator=self.evaluator) + + # Run genetic algorithm + population, logbook, hof = ga_runner.run(population_size=population_size, n_generations=generations, + cxpb=cxpb, mutation_prob=mutpb, n_elites=n_elites, + tournsize=tournsize, indpb=indpb, checkpoint=checkpoint) + + self.runs.append(ga_runner) + self.logbook = logbook + + # self.populations = population + # self.populations.append([[ind, ind.fitness] for ind in population]) + + # Log and save results + # self._log_and_save_results(hof) + + return self.get_clf(hof[0]) diff --git a/mloptimizer/genoptimizer/keras.py b/mloptimizer/core/keras.py similarity index 92% rename from mloptimizer/genoptimizer/keras.py rename to mloptimizer/core/keras.py index f95cb0a..dfb4220 100644 --- a/mloptimizer/genoptimizer/keras.py +++ b/mloptimizer/core/keras.py @@ -1,6 +1,6 @@ from abc import ABC from mloptimizer.aux.alg_wrapper import generate_model -from mloptimizer.genoptimizer import BaseOptimizer +from mloptimizer.core import BaseOptimizer from mloptimizer.hyperparams import Hyperparam @@ -29,7 +29,7 @@ def get_clf(self, individual): except ImportError as e: print(f"{e}: Keras is not installed. 
Please install it to use this function.") return None - individual_dict = self.individual2dict(individual) + individual_dict = self.deap_optimizer.individual2dict(individual) print(individual_dict) clf = KerasClassifier(build_fn=generate_model, **individual_dict) diff --git a/mloptimizer/genoptimizer/meta.py b/mloptimizer/core/meta.py similarity index 87% rename from mloptimizer/genoptimizer/meta.py rename to mloptimizer/core/meta.py index 175fee3..a4a5417 100644 --- a/mloptimizer/genoptimizer/meta.py +++ b/mloptimizer/core/meta.py @@ -1,4 +1,4 @@ -from mloptimizer.genoptimizer import BaseOptimizer +from mloptimizer.core import BaseOptimizer class SklearnOptimizer(BaseOptimizer): @@ -11,10 +11,10 @@ class SklearnOptimizer(BaseOptimizer): """ def __init__(self, clf_class, *args, **kwargs): - super().__init__(*args, **kwargs) self.clf_class = clf_class + super().__init__(*args, **kwargs) def get_clf(self, individual): - individual_dict = self.individual2dict(individual) + individual_dict = self.deap_optimizer.individual2dict(individual) clf = self.clf_class(random_state=self.mlopt_seed, **individual_dict) return clf diff --git a/mloptimizer/evaluation/evaluator.py b/mloptimizer/evaluation/evaluator.py index 0e09af2..ade668f 100644 --- a/mloptimizer/evaluation/evaluator.py +++ b/mloptimizer/evaluation/evaluator.py @@ -1,4 +1,6 @@ from sklearn.metrics import accuracy_score, balanced_accuracy_score +from mloptimizer.aux import Tracker +import numpy as np def _default_metrics(): @@ -14,6 +16,10 @@ class Evaluator: Parameters ---------- + features : array-like + The features to use to evaluate the classifier + labels : array-like + The labels to use to evaluate the classifier eval_function : function The evaluation function to use to evaluate the performance of the classifier fitness_score : str @@ -21,14 +27,24 @@ class Evaluator: metrics : dict The metrics to use to evaluate the performance of the classifier Dictionary of the form {"metric_name": metric_function} + tracker : Tracker + The tracker to use to log the evaluations + individual_utils : IndividualUtils + The individual utils to use to get the classifier from the individual """ - def __init__(self, eval_function, fitness_score="accuracy", metrics=None): + + def __init__(self, features: np.array, labels: np.array, eval_function, fitness_score="accuracy", metrics=None, + tracker: Tracker = None, individual_utils=None): if metrics is None: self.metrics = _default_metrics() else: self.metrics = metrics self.eval_function = eval_function self.fitness_score = fitness_score + self.tracker = tracker + self.features = features + self.labels = labels + self.individual_utils = individual_utils def evaluate(self, clf, features, labels): """ @@ -50,3 +66,9 @@ def evaluate(self, clf, features, labels): """ metrics = self.eval_function(features, labels, clf, self.metrics) return metrics + + def evaluate_individual(self, individual): + clf = self.individual_utils.get_clf(individual) + metrics = self.evaluate(clf=clf, features=self.features, labels=self.labels) + self.tracker.log_evaluation(clf, metrics) + return (metrics[self.fitness_score],) diff --git a/mloptimizer/genoptimizer/base.py b/mloptimizer/genoptimizer/base.py deleted file mode 100644 index 9490075..0000000 --- a/mloptimizer/genoptimizer/base.py +++ /dev/null @@ -1,544 +0,0 @@ -import os -import random -from abc import ABCMeta, abstractmethod -from random import randint - -import joblib -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from deap import creator, 
tools, base -from deap.algorithms import varAnd -from sklearn.metrics import accuracy_score - -from mloptimizer.evaluation import train_score -from mloptimizer.aux.plots import plotly_logbook, plotly_search_space -from mloptimizer.hyperparams import HyperparameterSpace -from mloptimizer.aux import Tracker, utils -from mloptimizer.evaluation import Evaluator - - -class BaseOptimizer(object): - """ - Base class for the optimization of a classifier - - Attributes - ---------- - features : np.array - np.array with the features - labels : np.array - np.array with the labels - hyperparam_space : HyperparameterSpace - object with the hyperparameter space: fixed and evolvable hyperparams - evaluator : Evaluator - object to evaluate the classifier - eval_dict : dict - dictionary with the evaluation of the individuals - populations : list - list of populations - logbook : list - list of logbook - seed : int - seed for the random functions - use_parallel : bool - flag to use parallel processing - use_mlflow : bool - flag to use mlflow - """ - __metaclass__ = ABCMeta - - def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_file="mloptimizer.log", - hyperparam_space: HyperparameterSpace = None, - eval_function=train_score, - fitness_score="accuracy", metrics=None, seed=random.randint(0, 1000000), - use_parallel=False, use_mlflow=False): - """ - Creates object BaseOptimizer. - - Parameters - ---------- - features : np.array - np.array with the features - labels : np.array - np.array with the labels - folder : path, optional (default=os.curdir) - folder to store the structure of files and folders product of executions - log_file : str, optional (default="mloptimizer.log") - log file name - hyperparam_space : HyperparameterSpace, optional (default=None) - object with the hyperparameter space: fixed and evolvable hyperparams - eval_function : func, optional (default=train_score) - function to evaluate the model from X, y, clf - fitness_score : str, optional (default="accuracy") - fitness score to use to evaluate the performance of the classifier - use_parallel : bool, optional (default=False) - flag to use parallel processing - use_mlflow : bool, optional (default=False) - flag to use mlflow - seed : int, optional (default=0) - seed for the random functions (deap, models, and splits on evaluations) - """ - # Input mandatory variables - self.features = features - self.labels = labels - # Input search space hyperparameters - self.hyperparam_space = hyperparam_space - - # ML Evaluator - if metrics is None: - metrics = {"accuracy": accuracy_score} - - self.evaluator = Evaluator(eval_function=eval_function, fitness_score=fitness_score, - metrics=metrics) - - # State vars - self.eval_dict = {} - self.populations = [] - self.logbook = None - self.mlopt_seed = None - self.set_mlopt_seed(seed) - - # Parallel - self.use_parallel = use_parallel - - # mlflow - self.use_mlflow = use_mlflow - - # Tracker - self.tracker = Tracker(name="mloptimizer", folder=folder, log_file=log_file, use_mlflow=self.use_mlflow) - - def set_mlopt_seed(self, seed): - """ - Method to set the seed for the random functions - - Parameters - ---------- - seed : int - seed for the random functions - """ - self.mlopt_seed = seed - random.seed(seed) - np.random.seed(seed) - - @staticmethod - def get_subclasses(my_class): - """ - Method to get all the subclasses of a class - (in this case use to get all the classifiers that can be optimized). 
- - Parameters - ---------- - my_class : class - class to get the subclasses - - Returns - ------- - list - list of subclasses - """ - subclasses = my_class.__subclasses__() - if len(subclasses) == 0: - return [] - next_subclasses = [] - [next_subclasses.extend(BaseOptimizer.get_subclasses(x)) for x in subclasses] - return [*subclasses, *next_subclasses] - - def init_individual(self, pcls): - """ - Method to create an individual - - Parameters - ---------- - pcls : class - class of the individual - - Returns - ------- - ind : individual - individual - """ - ps = [] - for k in self.hyperparam_space.evolvable_hyperparams.keys(): - ps.append(randint(self.hyperparam_space.evolvable_hyperparams[k].min_value, - self.hyperparam_space.evolvable_hyperparams[k].max_value) - ) - individual_initialized = pcls(ps) - return individual_initialized - - def individual2dict(self, individual): - """ - Method to convert an individual to a dictionary of hyperparams - - Parameters - ---------- - individual : individual - individual to convert - - Returns - ------- - individual_dict : dict - dictionary of hyperparams - """ - individual_dict = {} - keys = list(self.hyperparam_space.evolvable_hyperparams.keys()) - for i in range(len(keys)): - individual_dict[keys[i]] = self.hyperparam_space.evolvable_hyperparams[keys[i]].correct(individual[i]) - return {**individual_dict, **self.hyperparam_space.fixed_hyperparams} - - @abstractmethod - def get_clf(self, individual): - """ - Method to get the classifier from an individual. Abstract method implemented in each specific optimizer. - - Parameters - ---------- - individual : individual - individual to convert - - Returns - ------- - clf : classifier - classifier specific for the optimizer - """ - pass - - def evaluate_individual(self, individual): - """ - Method to evaluate the classifier from an individual. It uses the eval_function to evaluate the classifier. 
- - Parameters - ---------- - individual : individual - individual to convert - - Returns - ------- - mean : float - mean of the evaluation - """ - clf = self.get_clf(individual) - metrics = self.evaluator.evaluate(clf=clf, features=self.features, labels=self.labels) - self.tracker.log_evaluation(self.get_clf(individual), metrics) - return (metrics[self.evaluator.fitness_score],) - - def population_2_df(self): - """ - Method to convert the population to a pandas dataframe - - Returns - ------- - df : pandas dataframe - dataframe with the population - """ - data = [] - n = 0 - for p in self.populations: - for i in p: - i_hyperparams = self.get_clf(i[0]).get_params() - i_hyperparams['fitness'] = i[1].values[0] - i_hyperparams['population'] = n - data.append(i_hyperparams) - n += 1 - - df = pd.DataFrame(data) - return df - - def _write_population_file(self, filename=None): - """ - Method to write the population to a csv file - - Parameters - ---------- - filename : str, optional (default=None) - filename to save the population - """ - if filename is None: - filename = os.path.join(self.tracker.results_path, 'populations.csv') - self.population_2_df().sort_values(by=['fitness'], ascending=False - ).to_csv(filename, index=False) - - def _write_logbook_file(self, filename=None): - """ - Method to write the logbook to a csv file - - Parameters - ---------- - filename : str, optional (default=None) - filename to save the logbook - """ - if filename is None: - filename = os.path.join(self.tracker.results_path, 'logbook.csv') - pd.DataFrame(self.logbook).to_csv(filename, index=False) - - def _read_logbook_file(self, filename=None): - """ - Method to read the logbook from a csv file - - Parameters - ---------- - filename : str, optional (default=None) - filename to read the logbook - """ - if filename is None: - filename = os.path.join(self.tracker.results_path, 'logbook.csv') - data = [] - if os.path.exists(filename): - data = pd.read_csv(filename) - else: - self.tracker.optimization_logger.error("File {} does not exist".format(filename)) - return data - - def _log_and_save_results(self, hof): - halloffame_classifiers = list(map(self.get_clf, hof)) - halloffame_fitness = [ind.fitness.values[:] for ind in hof] - self.tracker.log_clfs(classifiers_list=halloffame_classifiers, generation=-1, - fitness_list=halloffame_fitness) - - self._write_population_file() - self._write_logbook_file() - - hyperparam_names = list(self.hyperparam_space.evolvable_hyperparams.keys()) - hyperparam_names.append("fitness") - population_df = self.population_2_df() - df = population_df[hyperparam_names] - g = plotly_search_space(df) - g.write_html(os.path.join(self.tracker.graphics_path, "search_space.html")) - plt.close() - - g2 = plotly_logbook(self.logbook, population_df) - # g2.savefig(os.path.join(self.graphics_path, "logbook.png")) - g2.write_html(os.path.join(self.tracker.graphics_path, "logbook.html")) - plt.close() - - def optimize_clf(self, population: int = 10, generations: int = 3, - checkpoint: str = None, opt_run_folder_name: str = None) -> object: - """ - Method to optimize the classifier. It uses the custom_ea_simple method to optimize the classifier. 
- - Parameters - ---------- - population : int, optional (default=10) - number of individuals in each generation - generations : int, optional (default=3) - number of generations - checkpoint : str, optional (default=None) - path to the checkpoint file - opt_run_folder_name : str, optional (default=None) - name of the folder where the execution will be saved - - Returns - ------- - clf : classifier - classifier with the best hyperparams - """ - # Log initialization - self.tracker.start_optimization(type(self).__name__) - - # Creation of individual and population - toolbox = base.Toolbox() - stats = tools.Statistics(lambda ind: ind.fitness.values) - stats.register("avg", np.mean) - stats.register("min", np.min) - stats.register("max", np.max) - - start_gen = 0 - # Using deap, custom for decision tree - creator.create("FitnessMax", base.Fitness, weights=(1.0,)) - creator.create("Individual", list, fitness=creator.FitnessMax) - - # Parallel https://deap.readthedocs.io/en/master/tutorials/basic/part4.html - # TODO: Scoop compatibility - if self.use_parallel: - try: - import multiprocessing - pool = multiprocessing.Pool() - toolbox.register("map", pool.map) - except ImportError as e: - # self.optimization_logger.warning("Multiprocessing not available: {}".format(e)) - self.tracker.optimization_logger.warning("Multiprocessing not available: {}".format(e)) - - toolbox.register("individual", self.init_individual, creator.Individual) - toolbox.register("population", tools.initRepeat, list, toolbox.individual) - - # Tools - pop = toolbox.population(n=population) - hof = tools.HallOfFame(10) - self.logbook = tools.Logbook() - - if checkpoint: - - # Load checkpoint - cp = self.tracker.load_checkpoint(checkpoint) - pop = cp['population'] - start_gen = cp['generation'] + 1 - hof = cp['halloffame'] - self.logbook = cp['logbook'] - random.setstate(cp['rndstate']) - - else: - - self.logbook.header = ['gen', 'nevals'] + (stats.fields if stats else []) - - # Create checkpoint_path from date and algorithm - self.tracker.start_checkpoint(opt_run_folder_name) - - # Methods for genetic algorithm - toolbox.register("mate", tools.cxTwoPoint) - toolbox.register("mutate", tools.mutUniformInt, - low=[x.min_value for x in self.hyperparam_space.evolvable_hyperparams.values()], - up=[x.max_value for x in self.hyperparam_space.evolvable_hyperparams.values()], - indpb=0.5) - toolbox.register("select", tools.selTournament, tournsize=4) - toolbox.register("evaluate", self.evaluate_individual) - - # History - hist = tools.History() - toolbox.decorate("mate", hist.decorator) - toolbox.decorate("mutate", hist.decorator) - hist.update(pop) - - fpop, self.logbook, hof = self.custom_ea_simple(pop, toolbox, self.logbook, cxpb=0.5, mutpb=0.5, - checkpoint_path=self.tracker.opt_run_checkpoint_path, - start_gen=start_gen, ngen=generations, stats=stats, - halloffame=hof) - - # self.tracker.optimization_logger.info("LOGBOOK: \n{}".format(self.logbook)) - # self.tracker.optimization_logger.info("HALL OF FAME: {} individuals".format(len(hof))) - - # Log and save results - self._log_and_save_results(hof) - - return self.get_clf(hof[0]) - - def custom_ea_simple(self, population, toolbox, logbook, - cxpb, mutpb, start_gen=0, ngen=4, checkpoint_path=None, stats=None, - halloffame=None, verbose=__debug__, checkpoint_flag=True): - """This algorithm reproduce the simplest evolutionary algorithm as - presented in chapter 7 of [Back2000]_. - - :param population: A list of individuals. 
- :param toolbox: A :class:`~deap.base.Toolbox` that contains the evolution - operators. - :param cxpb: The probability of mating two individuals. - :param mutpb: The probability of mutating an individual. - :param ngen: The number of generation. - :param stats: A :class:`~deap.tools.Statistics` object that is updated - inplace, optional. - :param halloffame: A :class:`~deap.tools.HallOfFame` object that will - contain the best individuals, optional. - :param verbose: Whether or not to log the statistics. - :returns: The final population - :returns: A class:`~deap.tools.Logbook` with the statistics of the - evolution - - The algorithm takes in a population and evolves it in place using the - :meth:`varAnd` method. It returns the optimized population and a - :class:`~deap.tools.Logbook` with the statistics of the evolution. The - logbook will contain the generation number, the number of evaluations for - each generation and the statistics if a :class:`~deap.tools.Statistics` is - given as argument. The *cxpb* and *mutpb* arguments are passed to the - :func:`varAnd` function. The pseudocode goes as follow :: - - evaluate(population) - for g in range(ngen): - population = select(population, len(population)) - offspring = varAnd(population, toolbox, cxpb, mutpb) - evaluate(offspring) - population = offspring - - As stated in the pseudocode above, the algorithm goes as follow. First, it - evaluates the individuals with an invalid fitness. Second, it enters the - generational loop where the selection procedure is applied to entirely - replace the parental population. The 1:1 replacement ratio of this - algorithm **requires** the selection procedure to be stochastic and to - select multiple times the same individual, for example, - :func:`~deap.tools.selTournament` and :func:`~deap.tools.selRoulette`. - Third, it applies the :func:`varAnd` function to produce the next - generation population. Fourth, it evaluates the new individuals and - compute the statistics on this population. Finally, when *ngen* - generations are done, the algorithm returns a tuple with the final - population and a :class:`~deap.tools.Logbook` of the evolution. - - .. note:: - - Using a non-stochastic selection method will result in no selection as - the operator selects *n* individuals from a pool of *n*. - - This function expects the :meth:`toolbox.mate`, :meth:`toolbox.mutate`, - :meth:`toolbox.select` and :meth:`toolbox.evaluate` aliases to be - registered in the toolbox. - - .. [Back2000] Back, Fogel and Michalewicz, "Evolutionary Computation 1 : - Basic Algorithms and Operators", 2000. 
- """ - - if checkpoint_flag and (checkpoint_path is None or not os.path.isdir(checkpoint_path)): - error_msg = "checkpoint_flag is True and checkpoint_path {} " \ - "is not a folder or does not exist".format(checkpoint_path) - self.tracker.optimization_logger.error(error_msg) - raise NotADirectoryError(error_msg) - - # Begin the generational process - - for gen in range(start_gen, ngen + 1): - progress_gen_path = os.path.join(self.tracker.progress_path, "Generation_{}.csv".format(gen)) - progress_gen_file = open(progress_gen_path, "w") - header_progress_gen_file = "i;total;Individual;fitness\n" - progress_gen_file.write(header_progress_gen_file) - progress_gen_file.close() - self.tracker.optimization_logger.info("Generation: {}".format(gen)) - # Vary the pool of individuals - population = varAnd(population, toolbox, cxpb, mutpb) - - # Evaluate the individuals with an invalid fitness - invalid_ind = [ind for ind in population if not ind.fitness.valid] - fitnesses = toolbox.map(toolbox.evaluate, invalid_ind) - c = 1 - evaluations_pending = len(invalid_ind) - for ind, fit in zip(invalid_ind, fitnesses): - self.tracker.optimization_logger.info( - "Fitting individual (informational purpose): gen {} - ind {} of {}".format( - gen, c, evaluations_pending - ) - ) - ind.fitness.values = fit - ind_formatted = self.individual2dict(ind) - progress_gen_file = open(progress_gen_path, "a") - progress_gen_file.write( - "{};{};{};{}\n".format(c, - evaluations_pending, - ind_formatted, fit) - ) - progress_gen_file.close() - c = c + 1 - - halloffame.update(population) - - record = stats.compile(population) if stats else {} - - logbook.record(gen=gen, nevals=len(invalid_ind), **record) - if verbose: - self.tracker.optimization_logger.info(logbook.stream) - - # Select the next generation individuals - population = toolbox.select(population, len(population)) - - halloffame_classifiers = list(map(self.get_clf, halloffame[:2])) - halloffame_fitness = [ind.fitness.values[:] for ind in halloffame[:2]] - self.tracker.log_clfs(classifiers_list=halloffame_classifiers, generation=gen, - fitness_list=halloffame_fitness) - - # Store the space hyperparams and fitness for each individual - self.populations.append([[ind, ind.fitness] for ind in population]) - - if checkpoint_flag: - # Fill the dictionary using the dict(key=value[, ...]) constructor - cp = dict(population=population, generation=gen, halloffame=halloffame, - logbook=logbook, rndstate=random.getstate()) - - cp_file = os.path.join(checkpoint_path, "cp_gen_{}.pkl".format(gen)) - joblib.dump(cp, cp_file) - self._write_population_file() - self._write_logbook_file() - - return population, logbook, halloffame diff --git a/mloptimizer/test/test_aux/test_plots.py b/mloptimizer/test/test_aux/test_plots.py index 9f01ee9..cf9f2af 100644 --- a/mloptimizer/test/test_aux/test_plots.py +++ b/mloptimizer/test/test_aux/test_plots.py @@ -1,6 +1,6 @@ import pytest from mloptimizer.aux.plots import logbook_to_pandas, plot_logbook, plot_search_space -from mloptimizer.genoptimizer import SklearnOptimizer +from mloptimizer.core import SklearnOptimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from sklearn.datasets import load_iris @@ -29,6 +29,6 @@ def test_plot_logbook(default_tree_optimizer): def test_plot_search_space(default_tree_optimizer): - populations_df = default_tree_optimizer.population_2_df() + populations_df = default_tree_optimizer.runs[-1].population_2_df() fig = plot_search_space(populations_df) assert fig 
is not None diff --git a/mloptimizer/test/test_genoptimizer/test_meta.py b/mloptimizer/test/test_genoptimizer/test_meta.py index 7243dd9..18ffa12 100644 --- a/mloptimizer/test/test_genoptimizer/test_meta.py +++ b/mloptimizer/test/test_genoptimizer/test_meta.py @@ -1,5 +1,5 @@ import pytest -from mloptimizer.genoptimizer import SklearnOptimizer +from mloptimizer.core import SklearnOptimizer from mloptimizer.hyperparams import Hyperparam, HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier @@ -141,14 +141,14 @@ def test_reproducibility(clf_class, target_score, default_metrics_dict): optimizer1 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict, eval_function=target_score, seed=seed, clf_class=clf_class, hyperparam_space=evolvable_hyperparams) - result1 = optimizer1.optimize_clf(population=population, generations=generations) + result1 = optimizer1.optimize_clf(population_size=population, generations=generations) optimizer2 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict, eval_function=target_score, seed=seed, clf_class=clf_class, hyperparam_space=evolvable_hyperparams) - result2 = optimizer2.optimize_clf(population=population, generations=generations) + result2 = optimizer2.optimize_clf(population_size=population, generations=generations) optimizer3 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict, eval_function=target_score, seed=distinct_seed, clf_class=clf_class, hyperparam_space=evolvable_hyperparams) - result3 = optimizer3.optimize_clf(population=population, generations=generations) + result3 = optimizer3.optimize_clf(population_size=population, generations=generations) assert str(result1) == str(result2) assert str(result1) != str(result3) diff --git a/setup.py b/setup.py index 4e9c3d6..90e529b 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ def read_requirements(requirements_file): # For a discussion on single-sourcing the version across setup.py and the # project code, see # https://packaging.python.org/guides/single-sourcing-package-version/ - version="0.7.0", # Required + version="0.7.1", # Required # This is a one-line description or tagline of what your project does. 
This # corresponds to the "Summary" metadata field: # https://packaging.python.org/specifications/core-metadata/#summary From 3a01d7b27feba97cb41bf85c412fcddfd9cb14bb Mon Sep 17 00:00:00 2001 From: Caparrini Date: Mon, 11 Mar 2024 00:57:41 +0100 Subject: [PATCH 08/16] Refactor: Added genetic subpackage to handle DEAP --- mloptimizer/genetic/__init__.py | 3 + mloptimizer/genetic/deapoptimizer.py | 118 ++++++++++ mloptimizer/genetic/garunner.py | 329 +++++++++++++++++++++++++++ mloptimizer/genetic/individual.py | 33 +++ 4 files changed, 483 insertions(+) create mode 100644 mloptimizer/genetic/__init__.py create mode 100644 mloptimizer/genetic/deapoptimizer.py create mode 100644 mloptimizer/genetic/garunner.py create mode 100644 mloptimizer/genetic/individual.py diff --git a/mloptimizer/genetic/__init__.py b/mloptimizer/genetic/__init__.py new file mode 100644 index 0000000..b41d2fb --- /dev/null +++ b/mloptimizer/genetic/__init__.py @@ -0,0 +1,3 @@ +from .deapoptimizer import DeapOptimizer +from .garunner import GeneticAlgorithmRunner +from .individual import IndividualUtils diff --git a/mloptimizer/genetic/deapoptimizer.py b/mloptimizer/genetic/deapoptimizer.py new file mode 100644 index 0000000..fb82c55 --- /dev/null +++ b/mloptimizer/genetic/deapoptimizer.py @@ -0,0 +1,118 @@ +import random +from deap import creator, base, tools +import numpy as np +from mloptimizer.hyperparams import HyperparameterSpace + + +class DeapOptimizer: + def __init__(self, hyperparam_space: HyperparameterSpace = None, use_parallel=False, seed=None): + """ + Class to start the parameters for the use of DEAP library. + + Parameters + ---------- + hyperparam_space : HyperparameterSpace + hyperparameter space + use_parallel : bool + flag to use parallel processing + seed : int + seed for the random functions + + Attributes + ---------- + hyperparam_space : HyperparameterSpace + hyperparameter space + use_parallel : bool + flag to use parallel processing + seed : int + seed for the random functions + toolbox : deap.base.Toolbox + toolbox for the optimization + eval_dict : dict + dictionary with the evaluation of the individuals + logbook : list + list of logbook + stats : deap.tools.Statistics + statistics of the optimization + """ + self.hyperparam_space = hyperparam_space + self.use_parallel = use_parallel + self.seed = seed + random.seed(seed) + np.random.seed(seed) + + self.toolbox = base.Toolbox() + self.eval_dict = {} + self.logbook = None + self.stats = None + self.setup() + + def init_individual(self, pcls): + """ + Method to create an individual + + Parameters + ---------- + pcls : class + class of the individual + + Returns + ------- + ind : individual + individual + """ + ps = [] + for k in self.hyperparam_space.evolvable_hyperparams.keys(): + ps.append(random.randint(self.hyperparam_space.evolvable_hyperparams[k].min_value, + self.hyperparam_space.evolvable_hyperparams[k].max_value) + ) + individual_initialized = pcls(ps) + return individual_initialized + + def individual2dict(self, individual): + """ + Method to convert an individual to a dictionary of hyperparams + + Parameters + ---------- + individual : individual + individual to convert + + Returns + ------- + individual_dict : dict + dictionary of hyperparams + """ + individual_dict = {} + keys = list(self.hyperparam_space.evolvable_hyperparams.keys()) + for i in range(len(keys)): + individual_dict[keys[i]] = self.hyperparam_space.evolvable_hyperparams[keys[i]].correct(individual[i]) + return {**individual_dict, 
**self.hyperparam_space.fixed_hyperparams}
+
+    def setup(self):
+        """
+        Method to set the parameters for the optimization.
+        """
+        self.stats = tools.Statistics(lambda ind: ind.fitness.values)
+        self.stats.register("avg", np.mean)
+        self.stats.register("min", np.min)
+        self.stats.register("max", np.max)
+        start_gen = 0
+        # Using deap, custom for decision tree
+        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
+        creator.create("Individual", list, fitness=creator.FitnessMax)
+
+        # Parallel https://deap.readthedocs.io/en/master/tutorials/basic/part4.html
+        if self.use_parallel:
+            try:
+                #from scoop import futures
+                import multiprocessing
+                pool = multiprocessing.Pool()
+                self.toolbox.register("map", pool.map)
+            except ImportError as e:
+                # self.optimization_logger.warning("Multiprocessing not available: {}".format(e))
+                # self.tracker.optimization_logger.warning("Multiprocessing not available: {}".format(e))
+                print("Multiprocessing not available: {}".format(e))
+
+        self.toolbox.register("individual", self.init_individual, creator.Individual)
+        self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)
diff --git a/mloptimizer/genetic/garunner.py b/mloptimizer/genetic/garunner.py
new file mode 100644
index 0000000..88b6c06
--- /dev/null
+++ b/mloptimizer/genetic/garunner.py
@@ -0,0 +1,329 @@
+import deap.base
+from deap.algorithms import eaSimple, varAnd
+from deap import tools
+from matplotlib import pyplot as plt
+
+from mloptimizer.aux.plots import plotly_search_space, plotly_logbook
+from mloptimizer.genetic import DeapOptimizer
+from mloptimizer.aux import Tracker
+import os
+import joblib
+import pandas as pd
+
+
+class GeneticAlgorithmRunner:
+    def __init__(self, deap_optimizer: DeapOptimizer, tracker: Tracker,
+                 seed, evaluator):
+        """
+        Class to run the genetic algorithm
+
+        Parameters
+        ----------
+        deap_optimizer : ~mloptimizer.genetic.DeapOptimizer
+            optimizer
+        tracker : ~mloptimizer.aux.Tracker
+            tracker
+        seed : int
+            seed for the random functions
+        evaluator : ~mloptimizer.evaluation.Evaluator
+            evaluator used to compute the fitness and metrics of each individual
+
+        Attributes
+        ----------
+        populations : list
+            list of populations
+        tracker : ~mloptimizer.aux.Tracker
+            tracker
+        deap_optimizer : ~mloptimizer.genetic.DeapOptimizer
+            optimizer
+        toolbox : ~deap.base.Toolbox
+            toolbox
+        evaluator : ~mloptimizer.evaluation.Evaluator
+            evaluator registered as the toolbox ``evaluate`` alias
+        seed : int
+            seed for the random functions
+        """
+        self.populations = []
+        self.tracker = tracker
+        self.deap_optimizer = deap_optimizer
+        self.toolbox = self.deap_optimizer.toolbox
+
+        self.evaluator = evaluator
+        self.toolbox.register("evaluate", self.evaluator.evaluate_individual)
+        self.seed = seed
+
+    def simple_run(self, population_size: int, n_generations: int, cxpb: float = 0.5, mutation_prob: float = 0.5,
+                   n_elites: int = 10, tournsize: int = 3, indpb: float = 0.05):
+        """
+        Method to run the genetic algorithm. This uses the deap eaSimple method.
+        It cannot be used to track what happens in each generation.
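+
+        A minimal usage sketch (assuming a ``DeapOptimizer``, ``Tracker`` and
+        ``Evaluator`` already wired up, as ``BaseOptimizer`` does)::
+
+            runner = GeneticAlgorithmRunner(deap_optimizer=deap_optimizer,
+                                            tracker=tracker, seed=0,
+                                            evaluator=evaluator)
+            population, logbook, hof = runner.simple_run(population_size=10,
+                                                         n_generations=3)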
+
+        Parameters
+        ----------
+        population_size : int
+            size of the population
+        n_generations : int
+            number of generations
+        cxpb : float
+            crossover probability
+        mutation_prob : float
+            mutation probability
+        n_elites : int
+            number of elites
+        tournsize : int
+            size of the tournament
+        indpb : float
+            probability of a gene to be mutated
+
+        Returns
+        -------
+        population : list
+            final population
+        logbook : ~deap.tools.Logbook
+            logbook
+        hof : ~deap.tools.HallOfFame
+            hall of fame
+        """
+        hof, pop = self._pre_run(indpb=indpb, n_elites=n_elites, population_size=population_size,
+                                 tournsize=tournsize)
+        population, logbook = eaSimple(population=pop, toolbox=self.toolbox, cxpb=cxpb, mutpb=mutation_prob,
+                                       ngen=n_generations, stats=self.deap_optimizer.stats, halloffame=hof,
+                                       verbose=True)
+
+        return population, logbook, hof
+
+    def run(self, population_size: int, n_generations: int, cxpb: float = 0.5, mutation_prob: float = 0.5,
+            n_elites: int = 10, tournsize: int = 3, indpb: float = 0.05, checkpoint: str = None) -> object:
+        """
+        Method to run the genetic algorithm. This uses the custom_ea_simple method.
+        It allows tracking what happens in each generation.
+
+        Parameters
+        ----------
+        population_size : int
+            size of the population
+        n_generations : int
+            number of generations
+        cxpb : float
+            crossover probability
+        mutation_prob : float
+            mutation probability
+        n_elites : int
+            number of elites
+        tournsize : int
+            size of the tournament
+        indpb : float
+            probability of a gene to be mutated
+        checkpoint : str
+            path to the checkpoint file
+
+        Returns
+        -------
+        population : list
+            final population
+        logbook : ~deap.tools.Logbook
+            logbook
+        hof : ~deap.tools.HallOfFame
+            hall of fame
+        """
+        hof, pop = self._pre_run(indpb=indpb, n_elites=n_elites, population_size=population_size,
+                                 tournsize=tournsize)
+
+        population, logbook, hof = self.custom_ea_simple(population=pop, toolbox=self.toolbox, cxpb=cxpb,
+                                                         mutpb=mutation_prob,
+                                                         ngen=n_generations, halloffame=hof, verbose=True,
+                                                         checkpoint_path=self.tracker.opt_run_checkpoint_path,
+                                                         stats=self.deap_optimizer.stats)
+
+        hyperparam_names = list(self.deap_optimizer.hyperparam_space.evolvable_hyperparams.keys())
+        hyperparam_names.append("fitness")
+        population_df = self.population_2_df()
+        df = population_df[hyperparam_names]
+        g = plotly_search_space(df)
+        g.write_html(os.path.join(self.tracker.graphics_path, "search_space.html"))
+        plt.close()
+
+        g2 = plotly_logbook(logbook, population_df)
+        g2.write_html(os.path.join(self.tracker.graphics_path, "logbook.html"))
+        plt.close()
+
+        return population, logbook, hof
+
+    def _pre_run(self, indpb: float = 0.5, n_elites: int = 10,
+                 population_size: int = 10, tournsize: int = 4):
+        """
+        Method to initialize the population and the hall of fame
+
+        Parameters
+        ----------
+        indpb : float
+            probability of a gene to be mutated
+        n_elites : int
+            number of elites
+        population_size : int
+            size of the population
+        tournsize : int
+            size of the tournament
+
+        Returns
+        -------
+        hof : ~deap.tools.HallOfFame
+            hall of fame
+        pop : list
+            population
+        """
+        # Initialize population
+        pop = self.toolbox.population(n=population_size)
+        # Initialize hall of fame
+        hof = tools.HallOfFame(n_elites)
+        # Methods for genetic algorithm
+        self.toolbox.register("mate", tools.cxTwoPoint)
+        self.toolbox.register(
+            "mutate", tools.mutUniformInt,
+            low=[x.min_value for x in self.deap_optimizer.hyperparam_space.evolvable_hyperparams.values()],
+            up=[x.max_value for x in self.deap_optimizer.hyperparam_space.evolvable_hyperparams.values()],
+            indpb=indpb
+        )
+        self.toolbox.register("select", tools.selTournament, tournsize=tournsize)
+        # History
+        hist = tools.History()
+        self.toolbox.decorate("mate", hist.decorator)
+        self.toolbox.decorate("mutate", hist.decorator)
+        hist.update(pop)
+        return hof, pop
+
+    def custom_ea_simple(self, population: list, toolbox: deap.base.Toolbox,
+                         cxpb: float = 0.5, mutpb: float = 0.5, start_gen: int = 0, ngen: int = 4,
+                         checkpoint_path: str = None,
+                         stats: deap.tools.Statistics = None,
+                         halloffame: deap.tools.HallOfFame = None, verbose: bool = True,
+                         checkpoint_flag: bool = True):
+        """
+        This algorithm reproduces the simplest evolutionary algorithm as
+        presented in chapter 7 of [Back2000]_.
+
+        The code is close to the ~deap.algorithms.eaSimple method, but it has been modified to track
+        the progress of the optimization and to save the population and the logbook in each generation.
+        More info can be found
+        `on deap documentation `__
+
+        Parameters
+        ----------
+        population : list
+            A list of individuals.
+        toolbox : ~deap.base.Toolbox
+            A `toolbox` that contains the evolution operators.
+        cxpb : float
+            The probability of mating two individuals.
+        mutpb : float
+            The probability of mutating an individual.
+        start_gen : int
+            The starting generation number. Used in case of checkpoint.
+        ngen : int
+            The number of generations.
+        checkpoint_path : str
+            The path to the checkpoint file.
+        stats : ~deap.tools.Statistics
+            A `~deap.tools.Statistics` object that is updated in place, optional.
+        halloffame : ~deap.tools.HallOfFame
+            A `~deap.tools.HallOfFame` object that contains the best individuals, optional.
+        verbose : bool
+            Whether or not to log the statistics.
+        checkpoint_flag : bool
+            Whether or not to save the checkpoint.
+
+        Returns
+        -------
+        population : list
+            The final population.
+        logbook : ~deap.tools.Logbook
+            A logbook containing the statistics of the evolution.
+        halloffame : ~deap.tools.HallOfFame
+            A hall of fame object that contains the best individuals.
+
+        References
+        ----------
+        .. [Back2000] Back, Fogel and Michalewicz, "Evolutionary Computation 1 :
+           Basic Algorithms and Operators", 2000.
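+
+        A checkpoint written by this method is a ``joblib`` pickle of a dict
+        with the keys ``population``, ``generation``, ``halloffame``,
+        ``logbook`` and ``rndstate`` (here, the run seed), saved as
+        ``cp_gen_<gen>.pkl`` under ``checkpoint_path``, so a finished or
+        interrupted run can be inspected with, e.g.,
+        ``joblib.load("cp_gen_2.pkl")["logbook"]``.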
+ """ + logbook = tools.Logbook() + logbook.header = ['gen', 'nevals'] + (stats.fields if stats else []) + + # Verify if the checkpoint path exists if checkpoint_flag is True + if checkpoint_flag and (checkpoint_path is None or not os.path.isdir(checkpoint_path)): + error_msg = "checkpoint_flag is True and checkpoint_path {} " \ + "is not a folder or does not exist".format(checkpoint_path) + self.tracker.optimization_logger.error(error_msg) + raise NotADirectoryError(error_msg) + + # Begin the generational process + # import multiprocessing + # pool = multiprocessing.Pool() + # toolbox.register("map", pool.map) + for gen in range(start_gen, ngen + 1): + self.tracker.start_progress_file(gen) + + # Vary the pool of individuals + population = varAnd(population, toolbox, cxpb, mutpb) + + # Evaluate the individuals with an invalid fitness + invalid_ind = [ind for ind in population if not ind.fitness.valid] + fitnesses = toolbox.map(toolbox.evaluate, invalid_ind) + c = 1 + evaluations_pending = len(invalid_ind) + for ind, fit in zip(invalid_ind, fitnesses): + ind.fitness.values = fit + ind_formatted = self.deap_optimizer.individual2dict(ind) + self.tracker.append_progress_file(gen, c, evaluations_pending, ind_formatted, fit) + + c = c + 1 + + halloffame.update(population) + + record = stats.compile(population) if stats else {} + + logbook.record(gen=gen, nevals=len(invalid_ind), **record) + if verbose: + self.tracker.optimization_logger.info(logbook.stream) + + # Select the next generation individuals + population = toolbox.select(population, len(population)) + + # halloffame_classifiers = list(map(self.get_clf, halloffame[:2])) + # halloffame_fitness = [ind.fitness.values[:] for ind in halloffame[:2]] + # self.tracker.log_clfs(classifiers_list=halloffame_classifiers, generation=gen, + # fitness_list=halloffame_fitness) + # Store the space hyperparams and fitness for each individual + self.populations.append([[ind, ind.fitness] for ind in population]) + + if checkpoint_flag: + # Fill the dictionary using the dict(key=value[, ...]) constructor + cp = dict(population=population, generation=gen, halloffame=halloffame, + logbook=logbook, rndstate=self.seed) + + cp_file = os.path.join(checkpoint_path, "cp_gen_{}.pkl".format(gen)) + joblib.dump(cp, cp_file) + self.tracker.write_population_file(self.population_2_df()) + self.tracker.write_logbook_file(logbook) + + return population, logbook, halloffame + + def population_2_df(self): + """ + Method to convert the population to a pandas dataframe + + Returns + ------- + df : pandas dataframe + dataframe with the population + """ + data = [] + n = 0 + for p in self.populations: + for i in p: + i_hyperparams = self.deap_optimizer.individual2dict(i[0]) + i_hyperparams['fitness'] = i[1].values[0] + i_hyperparams['population'] = n + data.append(i_hyperparams) + n += 1 + + df = pd.DataFrame(data) + return df diff --git a/mloptimizer/genetic/individual.py b/mloptimizer/genetic/individual.py new file mode 100644 index 0000000..275cee9 --- /dev/null +++ b/mloptimizer/genetic/individual.py @@ -0,0 +1,33 @@ +from mloptimizer.hyperparams import HyperparameterSpace + + +class IndividualUtils: + def __init__(self, hyperparam_space: HyperparameterSpace = None, clf_class=None, mlopt_seed=None): + self.hyperparam_space = hyperparam_space + self.clf_class = clf_class + self.mlopt_seed = mlopt_seed + + def get_clf(self, individual): + individual_dict = self.individual2dict(individual) + clf = self.clf_class(random_state=self.mlopt_seed, **individual_dict) + return clf + + 
def individual2dict(self, individual): + """ + Method to convert an individual to a dictionary of hyperparams + + Parameters + ---------- + individual : individual + individual to convert + + Returns + ------- + individual_dict : dict + dictionary of hyperparams + """ + individual_dict = {} + keys = list(self.hyperparam_space.evolvable_hyperparams.keys()) + for i in range(len(keys)): + individual_dict[keys[i]] = self.hyperparam_space.evolvable_hyperparams[keys[i]].correct(individual[i]) + return {**individual_dict, **self.hyperparam_space.fixed_hyperparams} From bd19b7e754aecf0ce2f6960027bd90b19f65a94b Mon Sep 17 00:00:00 2001 From: Caparrini Date: Wed, 20 Mar 2024 20:36:45 +0100 Subject: [PATCH 09/16] Updated: Authors updated with javiag --- README.md | 3 ++- setup.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6e19b13..19f2f8a 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,8 @@ with examples, classes and methods reference. ## Authors -* **Antonio Caparrini** - *Owner* - [caparrini](https://github.com/caparrini) +* **Antonio Caparrini** - *Author* - [caparrini](https://github.com/caparrini) +* **Javier Arroyo Gallardo** - *Author* - [javiag](https://github.com/javiag) ## License diff --git a/setup.py b/setup.py index 90e529b..19af932 100644 --- a/setup.py +++ b/setup.py @@ -63,10 +63,10 @@ def read_requirements(requirements_file): url="https://github.com/Caparrini/mloptimizer", # Optional # This should be your name or the name of the organization which owns the # project. - author="Antonio Caparrini", # Optional + author="Antonio Caparrini López, Javier Arroyo Gallardo", # Optional # This should be a valid email address corresponding to the author listed # above. - author_email="acaparri@ucm.es", # Optional + author_email="acaparri@ucm.es, ", # Optional # Classifiers help users find your project by categorizing it. # # For a list of valid classifiers, see https://pypi.org/classifiers/ From 486a5303582ad7491e2d06d5d9c234f027781aa3 Mon Sep 17 00:00:00 2001 From: Caparrini Date: Wed, 20 Mar 2024 20:58:18 +0100 Subject: [PATCH 10/16] Added: mermaid little example and compatibility --- docs/conf.py | 3 ++- docs/sections/Concepts/index.rst | 7 +++++++ requirements_dev.txt | 3 ++- requirements_docs.txt | 3 ++- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index ad6da04..eca7e9c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,7 +31,8 @@ 'sphinx.ext.graphviz', 'sphinx.ext.intersphinx', 'autoapi.extension', - 'sphinx_favicon' + 'sphinx_favicon', + 'sphinxcontrib.mermaid' ] templates_path = ['_templates'] diff --git a/docs/sections/Concepts/index.rst b/docs/sections/Concepts/index.rst index 2646359..6e67c3c 100644 --- a/docs/sections/Concepts/index.rst +++ b/docs/sections/Concepts/index.rst @@ -4,6 +4,13 @@ Concepts Concepts are the building blocks of the hyperparameter optimization framework. They are used to define the search space and the score function. +.. mermaid:: + + classDiagram + class SklearnOptimizer{ + +String model + } + .. 
toctree::
    :hidden:

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 96dd59f..6f617a5 100755
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -19,4 +19,5 @@ sphinx_book_theme
 sphinx_mdinclude
 sphinx-autoapi
 sphinx-favicon
-mlflow
\ No newline at end of file
+mlflow
+sphinxcontrib-mermaid
\ No newline at end of file
diff --git a/requirements_docs.txt b/requirements_docs.txt
index 8717a71..694b76f 100644
--- a/requirements_docs.txt
+++ b/requirements_docs.txt
@@ -18,4 +18,5 @@ sphinx-gallery==0.14.0
 sphinx_book_theme
 sphinx_mdinclude
 sphinx-autoapi
-sphinx-favicon
\ No newline at end of file
+sphinx-favicon
+sphinxcontrib-mermaid
\ No newline at end of file

From d8450ba90820dc94628723751aa118253c77f8e1 Mon Sep 17 00:00:00 2001
From: Caparrini
Date: Wed, 20 Mar 2024 22:35:21 +0100
Subject: [PATCH 11/16] Refactor: genoptimizer -> core, SklearnOptimizer now is
 Optimizer, BaseOptimizer no longer exists (old ABC Class), clf_class renamed
 to estimator_class

---
 docs/sections/Basics/overview.rst     | 38 +++++++++++++--------------
 docs/sections/Concepts/hyperparam.rst | 10 +++----
 docs/sections/Concepts/index.rst      |  4 +--
 docs/sections/Concepts/parallel.rst   |  6 ++---
 mloptimizer/core/__init__.py          |  3 +--
 mloptimizer/core/base.py              | 34 +++++++++---------------
 mloptimizer/core/keras.py             |  4 +--
 mloptimizer/core/meta.py              | 20 --------------
 mloptimizer/genetic/individual.py     |  6 ++---
 mloptimizer/hyperparams/hyperspace.py | 10 +++----
 10 files changed, 53 insertions(+), 82 deletions(-)
 delete mode 100644 mloptimizer/core/meta.py

diff --git a/docs/sections/Basics/overview.rst b/docs/sections/Basics/overview.rst
index 3acad68..a6c37a5 100644
--- a/docs/sections/Basics/overview.rst
+++ b/docs/sections/Basics/overview.rst
@@ -4,15 +4,15 @@ Overview
 Introduction
 ------------
-The main class objects are the `SklearnOptimizer` and the `HyperparameterSpace` classes.
+The main class objects are the `Optimizer` and the `HyperparameterSpace` classes.
 
-The optimizer `SklearnOptimizer` is able to optimize any model that complies with the `sklearn` API.
+The `Optimizer` is able to optimize any model that complies with the `sklearn` API.
 The `HyperparameterSpace` class is used to define the hyperparameters of the model,
 both the fixed hyperparameters and those that will be optimized.
 
 Usage
 -----
-To use the `SklearnOptimizer` class:
+To use the `Optimizer` class:
 
 1. Define your features and labels.
 2. Choose a model to optimize that complies with the `sklearn` API. (e.g. `XGBClassifier`).
@@ -23,9 +23,9 @@ To use the `SklearnOptimizer` class:
 There are default HyperparameterSpaces defined in the ``conf`` folder for the most common models.
 You can use the HyperparameterSpace.get_default_hyperparams(class) (class e.g. XGBClassifier).
 
-There are several parameters than can be passed to the `SklearnOptimizer` constructor:
+There are several parameters that can be passed to the `Optimizer` constructor:
 
-- `clf_class`: The class of the model to optimize. It should comply with the `sklearn` API.
+- `estimator_class`: The class of the model to optimize. It should comply with the `sklearn` API.
 - `X`: The features of your dataset.
 - `y`: The labels of your dataset.
 - `folder`: The folder where the files and folders will be saved. Defaults to the current directory.
@@ -43,13 +43,13 @@ The simplest example of using the Optimizer is:
 
 - Store your features and labels in `X` and `y` respectively.
 - Use HyperparameterSpace.get_default_hyperparams(XGBClassifier) to get the default hyperparameters for the model you want to optimize.
-- Create an instance of `SklearnOptimizer` with your classifier class, hyperparameter space, data and leave all other parameters to their default values.
+- Create an instance of `Optimizer` with your classifier class, hyperparameter space and data, leaving all other parameters at their default values.
 - Call the `optimize_clf()` method to start the optimization process. You can pass the population size and the number of generations to the method.
 - The result of the optimization process will be an object of type XGBClassifier with the best hyperparameters found.
 
 .. code-block:: python
 
-    from mloptimizer.genoptimizer import SklearnOptimizer
+    from mloptimizer.core import Optimizer
     from mloptimizer.hyperparams import HyperparameterSpace
     from xgboost import XGBClassifier
     from sklearn.datasets import load_iris
@@ -61,18 +61,18 @@ The simplest example of using the Optimizer is:
     hyperparameter_space = HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier)
 
     # 3) Create the optimizer and optimize the classifier
-    opt = SklearnOptimizer(clf_class=XGBClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space)
+    opt = Optimizer(estimator_class=XGBClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space)
 
     clf = opt.optimize_clf(10, 10)
 
-This will create a folder (in the current location) with name `YYYYMMDD_nnnnnnnnnn_SklearnOptimizer`
+This will create a folder (in the current location) with name `YYYYMMDD_nnnnnnnnnn_Optimizer`
 (where `YYYYMMDD_nnnnnnnnnn` is the current timestamp) and a log file named `mloptimizer.log`.
 To inspect the structure of the folder and what you can find in it, please refer to the `Folder Structure` section.
 
 Custom HyperparameterSpace Example
 ----------------------------------
-Among the parameters that can be passed to the `SklearnOptimizer` constructor,
+Among the parameters that can be passed to the `Optimizer` constructor,
 the `hyperparam_space` of class `HyperparameterSpace` is really important
 and should be aligned with the machine learning algorithm passed to the Optimizer:
 `fixed_hyperparams` and `evolvable_hyperparams`.
@@ -107,8 +107,8 @@ An example of using custom hyperparameters is:
     custom_hyperparam_space = HyperparameterSpace(fixed_hyperparams, evolvable_hyperparams)
 
     # Create an instance of XGBClassifierOptimizer with custom hyperparameters
-    xgb_optimizer = SklearnOptimizer(clf_class=XGBClassifier,features=X, labels=y,
-                                     hyperparam_space=custom_hyperparam_space)
+    xgb_optimizer = Optimizer(estimator_class=XGBClassifier, features=X, labels=y,
+                              hyperparam_space=custom_hyperparam_space)
 
     # Start the optimization process
     result = xgb_optimizer.optimize_clf(3, 3)
@@ -127,7 +127,7 @@ Researchers often need to be able to reproduce their results. During the researc
 advisable to run several optimization processes with different parameters or input data.
 However, if the results of the optimization process are not reproducible,
 it will be difficult to compare the results of the different optimization processes.
-In order to make the results reproducible, the `SklearnOptimizer` have a `seed` parameter.
+In order to make the results reproducible, the `Optimizer` has a `seed` parameter.
 This parameter is used to set the seed of the random number generator used during the optimization process.
 If you set the same seed, the results of the optimization process will be the same.
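+
+Under the hood the seed is applied with ``random.seed`` and ``numpy.random.seed``
+(see ``set_mlopt_seed``), so the DEAP operators, the models and the evaluation
+splits all draw from the same reproducible stream. A rough sketch of the idea
+(illustrative only, not the full implementation):
+
+.. code-block:: python
+
+    import random
+    import numpy as np
+
+    def set_mlopt_seed(seed):
+        # Seed both RNGs used by the DEAP operators, the models and the splits
+        random.seed(seed)
+        np.random.seed(seed)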
@@ -135,7 +135,7 @@ An example of two executions of the optimization process with the same seed that .. code-block:: python - from mloptimizer.genoptimizer import SklearnOptimizer + from mloptimizer.core import Optimizer from mloptimizer.hyperparams import HyperparameterSpace from xgboost import XGBClassifier from sklearn.datasets import load_iris @@ -146,13 +146,13 @@ An example of two executions of the optimization process with the same seed that # 2) Define the hyperparameter space (a default space is provided for some algorithms) hyperparameter_space = HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier) - # 3) Create two instances of SklearnOptimizer with the same seed - xgb_optimizer1 = SklearnOptimizer(clf_class=XGBClassifier, features=X, labels=y, - hyperparam_space = hyperparameter_space, seed=42) + # 3) Create two instances of Optimizer with the same seed + xgb_optimizer1 = Optimizer(estimator_class=XGBClassifier, features=X, labels=y, + hyperparam_space = hyperparameter_space, seed=42) result1 = xgb_optimizer1.optimize_clf(3, 3) - xgb_optimizer2 = SklearnOptimizer(clf_class=XGBClassifier, features=X, labels=y, - hyperparam_space = hyperparameter_space, seed=42) + xgb_optimizer2 = Optimizer(estimator_class=XGBClassifier, features=X, labels=y, + hyperparam_space = hyperparameter_space, seed=42) result2 = xgb_optimizer2.optimize_clf(3, 3) # Verify that the results are the same diff --git a/docs/sections/Concepts/hyperparam.rst b/docs/sections/Concepts/hyperparam.rst index 341bbae..8c9ed01 100644 --- a/docs/sections/Concepts/hyperparam.rst +++ b/docs/sections/Concepts/hyperparam.rst @@ -101,15 +101,15 @@ Here's an example of how you can create a `HyperparameterSpace` instance and pas # Then we can use the hyperparam_space instance to optimize the hyperparameters from sklearn.tree import DecisionTreeClassifier from sklearn.datasets import load_iris - from mloptimizer.genoptimizer import SklearnOptimizer + from mloptimizer.core import Optimizer # Load the iris dataset X,y = load_iris(return_X_y=True) - tree_optimizer = SklearnOptimizer(clf_class=DecisionTreeClassifier, - hyperparam_space=hyperparam_space, - features=X, labels=y) + tree_optimizer = Optimizer(estimator_class=DecisionTreeClassifier, + hyperparam_space=hyperparam_space, + features=X, labels=y) tree_optimizer.optimize_clf(3, 3) -In this example, we define custom hyperparameters and create a `HyperparameterSpace` instance. We then use the `HyperparameterSpace` instance to optimize the hyperparameters of a `DecisionTreeClassifier` using the `SklearnOptimizer` class. +In this example, we define custom hyperparameters and create a `HyperparameterSpace` instance. We then use the `HyperparameterSpace` instance to optimize the hyperparameters of a `DecisionTreeClassifier` using the `Optimizer` class. diff --git a/docs/sections/Concepts/index.rst b/docs/sections/Concepts/index.rst index 6e67c3c..eb866f1 100644 --- a/docs/sections/Concepts/index.rst +++ b/docs/sections/Concepts/index.rst @@ -7,8 +7,8 @@ framework. They are used to define the search space and the score function. .. mermaid:: classDiagram - class SklearnOptimizer{ - +String model + class Optimizer{ + +estimator_class estimator_class } diff --git a/docs/sections/Concepts/parallel.rst b/docs/sections/Concepts/parallel.rst index b13493a..cad8f12 100644 --- a/docs/sections/Concepts/parallel.rst +++ b/docs/sections/Concepts/parallel.rst @@ -18,7 +18,7 @@ An example of the speedup that can be achieved using parallel processing is show .. 
code-block:: python - from mloptimizer.genoptimizer import SklearnOptimizer + from mloptimizer.core import Optimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from sklearn.datasets import load_iris @@ -35,14 +35,14 @@ An example of the speedup that can be achieved using parallel processing is show population = 50 generations = 4 - opt_with_parallel = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y, + opt_with_parallel = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space, seed=my_seed, use_parallel=True) start_time_parallel = time.time() clf_with_parallel = opt_with_parallel.optimize_clf(population, generations) end_time_parallel = time.time() - opt = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y, + opt = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space, seed=my_seed, use_parallel=False) start_time = time.time() clf = opt.optimize_clf(population, generations) diff --git a/mloptimizer/core/__init__.py b/mloptimizer/core/__init__.py index adb49e7..5395363 100644 --- a/mloptimizer/core/__init__.py +++ b/mloptimizer/core/__init__.py @@ -1,3 +1,2 @@ -from .base import BaseOptimizer +from .base import Optimizer from .keras import KerasClassifierOptimizer -from .meta import SklearnOptimizer diff --git a/mloptimizer/core/base.py b/mloptimizer/core/base.py index f7b046d..93074ea 100644 --- a/mloptimizer/core/base.py +++ b/mloptimizer/core/base.py @@ -1,6 +1,5 @@ import os import random -from abc import ABCMeta, abstractmethod import numpy as np from sklearn.metrics import accuracy_score @@ -13,12 +12,14 @@ from mloptimizer.genetic import DeapOptimizer, GeneticAlgorithmRunner -class BaseOptimizer(object): +class Optimizer: """ Base class for the optimization of a classifier Attributes ---------- + estimator_class : class + class of the classifier features : np.array np.array with the features labels : np.array @@ -40,9 +41,8 @@ class BaseOptimizer(object): use_mlflow : bool flag to use mlflow """ - __metaclass__ = ABCMeta - def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_file="mloptimizer.log", + def __init__(self, estimator_class, features: np.array, labels: np.array, folder=os.curdir, log_file="mloptimizer.log", hyperparam_space: HyperparameterSpace = None, eval_function=train_score, fitness_score="accuracy", metrics=None, seed=random.randint(0, 1000000), @@ -52,6 +52,8 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f Parameters ---------- + estimator_class : class + class of the classifier features : np.array np.array with the features labels : np.array @@ -73,6 +75,8 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f seed : int, optional (default=0) seed for the random functions (deap, models, and splits on evaluations) """ + # Model class + self.estimator_class = estimator_class # Input mandatory variables self.features = features self.labels = labels @@ -101,7 +105,7 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f # Evaluator self.individual_utils = IndividualUtils(hyperparam_space=self.hyperparam_space, - clf_class=self.clf_class, mlopt_seed=self.mlopt_seed) + estimator_class=self.estimator_class, mlopt_seed=self.mlopt_seed) self.evaluator = Evaluator(features=features, labels=labels, eval_function=eval_function, 
fitness_score=fitness_score, metrics=metrics, tracker=self.tracker, @@ -144,25 +148,13 @@ class to get the subclasses if len(subclasses) == 0: return [] next_subclasses = [] - [next_subclasses.extend(BaseOptimizer.get_subclasses(x)) for x in subclasses] + [next_subclasses.extend(Optimizer.get_subclasses(x)) for x in subclasses] return [*subclasses, *next_subclasses] - @abstractmethod def get_clf(self, individual): - """ - Method to get the classifier from an individual. Abstract method implemented in each specific optimizer. - - Parameters - ---------- - individual : individual - individual to convert - - Returns - ------- - clf : classifier - classifier specific for the optimizer - """ - pass + individual_dict = self.deap_optimizer.individual2dict(individual) + clf = self.estimator_class(random_state=self.mlopt_seed, **individual_dict) + return clf def optimize_clf(self, population_size: int = 10, generations: int = 3, cxpb=0.5, mutpb=0.5, tournsize=4, indpb=0.5, n_elites=10, diff --git a/mloptimizer/core/keras.py b/mloptimizer/core/keras.py index dfb4220..ff44bfb 100644 --- a/mloptimizer/core/keras.py +++ b/mloptimizer/core/keras.py @@ -1,10 +1,10 @@ from abc import ABC from mloptimizer.aux.alg_wrapper import generate_model -from mloptimizer.core import BaseOptimizer +from mloptimizer.core import Optimizer from mloptimizer.hyperparams import Hyperparam -class KerasClassifierOptimizer(BaseOptimizer, ABC): +class KerasClassifierOptimizer(Optimizer): """ Class for the optimization of a gradient boosting classifier from keras.wrappers.scikit_learn.KerasClassifier. It inherits from BaseOptimizer. diff --git a/mloptimizer/core/meta.py b/mloptimizer/core/meta.py deleted file mode 100644 index a4a5417..0000000 --- a/mloptimizer/core/meta.py +++ /dev/null @@ -1,20 +0,0 @@ -from mloptimizer.core import BaseOptimizer - - -class SklearnOptimizer(BaseOptimizer): - """ - This class is a wrapper for scikit-learn classifiers. It is used to optimize hyperparameters for scikit-learn - classifiers using genetic algorithms. The class inherits from the BaseOptimizer class and implements the - get_clf and get_default_hyperparams methods. The get_clf method returns a scikit-learn classifier with the - hyperparameters specified in the individual. The get_default_hyperparams method returns a dictionary with the - default hyperparameters for the scikit-learn classifier. 
- """ - - def __init__(self, clf_class, *args, **kwargs): - self.clf_class = clf_class - super().__init__(*args, **kwargs) - - def get_clf(self, individual): - individual_dict = self.deap_optimizer.individual2dict(individual) - clf = self.clf_class(random_state=self.mlopt_seed, **individual_dict) - return clf diff --git a/mloptimizer/genetic/individual.py b/mloptimizer/genetic/individual.py index 275cee9..7c89ba8 100644 --- a/mloptimizer/genetic/individual.py +++ b/mloptimizer/genetic/individual.py @@ -2,14 +2,14 @@ class IndividualUtils: - def __init__(self, hyperparam_space: HyperparameterSpace = None, clf_class=None, mlopt_seed=None): + def __init__(self, hyperparam_space: HyperparameterSpace = None, estimator_class=None, mlopt_seed=None): self.hyperparam_space = hyperparam_space - self.clf_class = clf_class + self.estimator_class = estimator_class self.mlopt_seed = mlopt_seed def get_clf(self, individual): individual_dict = self.individual2dict(individual) - clf = self.clf_class(random_state=self.mlopt_seed, **individual_dict) + clf = self.estimator_class(random_state=self.mlopt_seed, **individual_dict) return clf def individual2dict(self, individual): diff --git a/mloptimizer/hyperparams/hyperspace.py b/mloptimizer/hyperparams/hyperspace.py index 5433b57..089266e 100644 --- a/mloptimizer/hyperparams/hyperspace.py +++ b/mloptimizer/hyperparams/hyperspace.py @@ -112,14 +112,14 @@ def to_json(self, file_path, overwrite=False): json.dump(hyperparams_dict, file, indent=4) @staticmethod - def get_default_hyperparameter_space(clf_class): + def get_default_hyperparameter_space(estimator_class): """ This method returns a dictionary with the default hyperparameters for the scikit-learn classifier. It reads the default_hyperparameter_spaces.json file and returns the hyperparameters for the classifier Parameters ---------- - clf_class : class + estimator_class : class The scikit-learn classifier class Returns @@ -129,15 +129,15 @@ def get_default_hyperparameter_space(clf_class): """ with open(HyperparameterSpace.default_hyperparameter_spaces_json, 'r') as file: default_hyperparams = json.load(file) - if clf_class.__name__ in default_hyperparams.keys(): + if estimator_class.__name__ in default_hyperparams.keys(): return HyperparameterSpace.from_json( str(os.path.join(os.path.dirname(os.path.abspath(__file__)), - "..", "conf", default_hyperparams[clf_class.__name__] + "..", "conf", default_hyperparams[estimator_class.__name__] ) ) ) else: - raise ValueError(f"Default hyperparameter space for {clf_class.__name__} not found") + raise ValueError(f"Default hyperparameter space for {estimator_class.__name__} not found") def __str__(self): return (f"HyperparameterSpace(fixed_hyperparams={self.fixed_hyperparams}, " From 5f2ed83f44c6c2e50257701c7d0e8c31448bcb36 Mon Sep 17 00:00:00 2001 From: Caparrini Date: Wed, 20 Mar 2024 22:36:12 +0100 Subject: [PATCH 12/16] Refactor: genoptimizer -> core, SklearnOptimizer now is Optimizer, BaseOptimizer no longer exists (old ABC Class), clf_class renamed to estimator_class --- README.md | 4 +- docs/sections/Concepts/reproducibility.rst | 14 +-- examples/plot_evolution.py | 6 +- examples/plot_quickstart.py | 6 +- examples/plot_search_space.py | 6 +- mloptimizer/test/test_aux/test_plots.py | 6 +- .../{test_meta.py => test_base.py} | 90 +++++++++---------- 7 files changed, 66 insertions(+), 66 deletions(-) rename mloptimizer/test/test_genoptimizer/{test_meta.py => test_base.py} (61%) diff --git a/README.md b/README.md index 19f2f8a..9ae59b7 100644 --- a/README.md +++ 
b/README.md
@@ -54,7 +54,7 @@ You can get more information about the package installation at https://pypi.org/
 Here's a simple example of how to optimize hyperparameters in a decision tree classifier using the iris dataset:
 
 ```python
-from mloptimizer.core import SklearnOptimizer
+from mloptimizer.core import Optimizer
 from mloptimizer.hyperparams import HyperparameterSpace
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.datasets import load_iris
@@ -66,7 +66,7 @@ X, y = load_iris(return_X_y=True)
 hyperparameter_space = HyperparameterSpace.get_default_hyperparameter_space(DecisionTreeClassifier)
 
 # 3) Create the optimizer and optimize the classifier
-opt = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space)
+opt = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space)
 
 # 4) Optimize the classifier; the optimization returns the best estimator found in the optimization process
 # - 10 generations starting with a population of 10 individuals, other parameters are set to default
diff --git a/docs/sections/Concepts/reproducibility.rst b/docs/sections/Concepts/reproducibility.rst
index 3130218..6f4f649 100644
--- a/docs/sections/Concepts/reproducibility.rst
+++ b/docs/sections/Concepts/reproducibility.rst
@@ -17,7 +17,7 @@ An example of usage is:
 
     from sklearn.datasets import load_breast_cancer as dataset
     from sklearn.tree import DecisionTreeClassifier
-    from mloptimizer.genoptimizer import SklearnOptimizer
+    from mloptimizer.core import Optimizer
     from mloptimizer.hyperparams import HyperparameterSpace
 
     X, y = dataset(return_X_y=True)
@@ -28,19 +28,19 @@ An example of usage is:
     distinct_seed = 2
     # It is important to run the optimization
     # right after the creation of the optimizer
-    optimizer1 = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
-                                  hyperparam_space=default_hyperparam_space, seed=seed)
+    optimizer1 = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
+                           hyperparam_space=default_hyperparam_space, seed=seed)
     result1 = optimizer1.optimize_clf(population=population, generations=generations)
 
     # WARNING: if optimizer2 had been created right after optimizer1,
     # before optimizer1 was run, the results would be different
-    optimizer2 = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
-                                  hyperparam_space=default_hyperparam_space, seed=seed)
+    optimizer2 = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
+                           hyperparam_space=default_hyperparam_space, seed=seed)
     result2 = optimizer2.optimize_clf(population=population, generations=generations)
 
-    optimizer3 = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
-                                  hyperparam_space=default_hyperparam_space, seed=distinct_seed)
+    optimizer3 = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
+                           hyperparam_space=default_hyperparam_space, seed=distinct_seed)
     result3 = optimizer3.optimize_clf(population=population, generations=generations)
 
     str(result1) == str(result2)
diff --git a/examples/plot_evolution.py b/examples/plot_evolution.py
index 68c7723..b27825f 100644
--- a/examples/plot_evolution.py
+++ b/examples/plot_evolution.py
@@ -4,7 +4,7 @@
 mloptimizer provides a function to plot the evolution of the fitness function.
""" -from mloptimizer.core import SklearnOptimizer +from mloptimizer.core import Optimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from mloptimizer.aux.plots import plotly_logbook @@ -25,8 +25,8 @@ # %% # We use the default TreeOptimizer class to optimize a decision tree classifier. -opt = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y, - hyperparam_space=hyperparam_space, folder="Evolution_example") +opt = Optimizer(model_class=DecisionTreeClassifier, features=X, labels=y, + hyperparam_space=hyperparam_space, folder="Evolution_example") # %% # To optimizer the classifier we need to call the optimize_clf method. diff --git a/examples/plot_quickstart.py b/examples/plot_quickstart.py index aaadac0..b5d491e 100644 --- a/examples/plot_quickstart.py +++ b/examples/plot_quickstart.py @@ -5,7 +5,7 @@ Firstly, we import the necessary libraries to get data and plot the results. """ -from mloptimizer.core import SklearnOptimizer +from mloptimizer.core import Optimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from sklearn.datasets import load_iris @@ -27,8 +27,8 @@ # the second is the vector of labels and # the third (if provided) is the name of the folder where the results of mloptimizer Optimizers are saved. # The default value for this folder is "Optimizer" -opt = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y, - hyperparam_space=hyperparam_space, folder="Optimizer") +opt = Optimizer(model_class=DecisionTreeClassifier, features=X, labels=y, + hyperparam_space=hyperparam_space, folder="Optimizer") # %% # To optimizer the classifier we need to call the optimize_clf method. diff --git a/examples/plot_search_space.py b/examples/plot_search_space.py index f1f508a..e3f2fd3 100644 --- a/examples/plot_search_space.py +++ b/examples/plot_search_space.py @@ -4,7 +4,7 @@ mloptimizer provides a function to plot the search space of the optimization. """ -from mloptimizer.core import SklearnOptimizer +from mloptimizer.core import Optimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from mloptimizer.aux.plots import plotly_search_space @@ -25,8 +25,8 @@ # %% # We use the default TreeOptimizer class to optimize a decision tree classifier. -opt = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y, - hyperparam_space=hyperparam_space, folder="Search_space_example") +opt = Optimizer(model_class=DecisionTreeClassifier, features=X, labels=y, + hyperparam_space=hyperparam_space, folder="Search_space_example") # %% # To optimizer the classifier we need to call the optimize_clf method. 
diff --git a/mloptimizer/test/test_aux/test_plots.py b/mloptimizer/test/test_aux/test_plots.py index cf9f2af..7f13dbe 100644 --- a/mloptimizer/test/test_aux/test_plots.py +++ b/mloptimizer/test/test_aux/test_plots.py @@ -1,6 +1,6 @@ import pytest from mloptimizer.aux.plots import logbook_to_pandas, plot_logbook, plot_search_space -from mloptimizer.core import SklearnOptimizer +from mloptimizer.core import Optimizer from mloptimizer.hyperparams import HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from sklearn.datasets import load_iris @@ -10,8 +10,8 @@ def default_tree_optimizer(): X, y = load_iris(return_X_y=True) default_hyperparameter_space = HyperparameterSpace.get_default_hyperparameter_space(DecisionTreeClassifier) - opt = SklearnOptimizer(features=X, labels=y, clf_class=DecisionTreeClassifier, - hyperparam_space=default_hyperparameter_space) + opt = Optimizer(features=X, labels=y, estimator_class=DecisionTreeClassifier, + hyperparam_space=default_hyperparameter_space) opt.optimize_clf(10, 10) return opt diff --git a/mloptimizer/test/test_genoptimizer/test_meta.py b/mloptimizer/test/test_genoptimizer/test_base.py similarity index 61% rename from mloptimizer/test/test_genoptimizer/test_meta.py rename to mloptimizer/test/test_genoptimizer/test_base.py index 18ffa12..8b303ae 100644 --- a/mloptimizer/test/test_genoptimizer/test_meta.py +++ b/mloptimizer/test/test_genoptimizer/test_base.py @@ -1,5 +1,5 @@ import pytest -from mloptimizer.core import SklearnOptimizer +from mloptimizer.core import Optimizer from mloptimizer.hyperparams import Hyperparam, HyperparameterSpace from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier @@ -28,15 +28,15 @@ def default_metrics_dict(): } -@pytest.mark.parametrize('clf_class', +@pytest.mark.parametrize('estimator_class', (DecisionTreeClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, XGBClassifier, SVC)) -def test_sklearn_optimizer(clf_class): +def test_sklearn_optimizer(estimator_class): X, y = load_iris(return_X_y=True) - evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class) - mlopt = SklearnOptimizer(clf_class=clf_class, - hyperparam_space=evolvable_hyperparams, - features=X, labels=y) + evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(estimator_class) + mlopt = Optimizer(estimator_class=estimator_class, + hyperparam_space=evolvable_hyperparams, + features=X, labels=y) mlopt.optimize_clf(5, 5) assert mlopt is not None @@ -44,24 +44,24 @@ def test_sklearn_optimizer(clf_class): @pytest.mark.parametrize('use_mlflow', [True, False]) def test_mloptimizer(use_mlflow): X, y = load_breast_cancer(return_X_y=True) - mlopt = SklearnOptimizer(clf_class=XGBClassifier, - hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier), - features=X, labels=y, use_mlflow=use_mlflow) + mlopt = Optimizer(estimator_class=XGBClassifier, + hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier), + features=X, labels=y, use_mlflow=use_mlflow) mlopt.optimize_clf(5, 5) assert mlopt is not None def test_checkpoints(): X, y = load_breast_cancer(return_X_y=True) - mlopt = SklearnOptimizer(clf_class=XGBClassifier, - hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier), - features=X, labels=y) + mlopt = Optimizer(estimator_class=XGBClassifier, + 
hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier), + features=X, labels=y) clf = mlopt.optimize_clf(5, 5) - mlopt2 = SklearnOptimizer(clf_class=XGBClassifier, - hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier), - features=X, labels=y, - seed=mlopt.mlopt_seed) + mlopt2 = Optimizer(estimator_class=XGBClassifier, + hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier), + features=X, labels=y, + seed=mlopt.mlopt_seed) checkpoint = os.path.join(mlopt.tracker.opt_run_checkpoint_path, os.listdir(mlopt.tracker.opt_run_checkpoint_path)[-2] @@ -72,45 +72,45 @@ def test_checkpoints(): assert str(clf) == str(clf2) -@pytest.mark.parametrize('clf_class', +@pytest.mark.parametrize('estimator_class', (DecisionTreeClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, XGBClassifier, SVC)) @pytest.mark.parametrize('dataset', (load_breast_cancer, load_iris)) -def test_optimizer(clf_class, dataset, default_metrics_dict): +def test_optimizer(estimator_class, dataset, default_metrics_dict): X, y = dataset(return_X_y=True) - evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class) - opt = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", - metrics=default_metrics_dict, clf_class=clf_class, - hyperparam_space=evolvable_hyperparams) + evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(estimator_class) + opt = Optimizer(features=X, labels=y, fitness_score="accuracy", + metrics=default_metrics_dict, estimator_class=estimator_class, + hyperparam_space=evolvable_hyperparams) clf = opt.optimize_clf(2, 2) assert clf is not None -@pytest.mark.parametrize('clf_class', +@pytest.mark.parametrize('estimator_class', (DecisionTreeClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, SVC)) @pytest.mark.parametrize('dataset', (load_breast_cancer, load_iris)) -def test_optimizer_use_parallel(clf_class, dataset, default_metrics_dict): +def test_optimizer_use_parallel(estimator_class, dataset, default_metrics_dict): X, y = dataset(return_X_y=True) - evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class) + evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(estimator_class) my_seed = 25 population = 50 generations = 4 - opt_with_parallel = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", - metrics=default_metrics_dict, - seed=my_seed, use_parallel=True, - hyperparam_space=evolvable_hyperparams, clf_class=clf_class) + opt_with_parallel = Optimizer(features=X, labels=y, fitness_score="accuracy", + metrics=default_metrics_dict, + seed=my_seed, use_parallel=True, + hyperparam_space=evolvable_hyperparams, estimator_class=estimator_class) start_time_parallel = time.time() clf_with_parallel = opt_with_parallel.optimize_clf(population, generations) end_time_parallel = time.time() - opt = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict, - seed=my_seed, use_parallel=False, - hyperparam_space=evolvable_hyperparams, clf_class=clf_class) + opt = Optimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict, + seed=my_seed, use_parallel=False, + hyperparam_space=evolvable_hyperparams, estimator_class=estimator_class) start_time = time.time() clf = opt.optimize_clf(population, generations) @@ -127,28 +127,28 @@ def test_optimizer_use_parallel(clf_class, dataset, 
default_metrics_dict):
     assert elapsed_time_parallel < elapsed_time
 
 
-@pytest.mark.parametrize('clf_class',
+@pytest.mark.parametrize('estimator_class',
                          (DecisionTreeClassifier, RandomForestClassifier,
                           ExtraTreesClassifier, GradientBoostingClassifier, XGBClassifier, SVC))
 @pytest.mark.parametrize('target_score',
                          (kfold_score, train_score, train_test_score))
-def test_reproducibility(clf_class, target_score, default_metrics_dict):
+def test_reproducibility(estimator_class, target_score, default_metrics_dict):
     X, y = load_iris(return_X_y=True)
-    evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(clf_class)
+    evolvable_hyperparams = HyperparameterSpace.get_default_hyperparameter_space(estimator_class)
     population = 2
     generations = 2
     seed = 25
     distinct_seed = 2
-    optimizer1 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict,
-                                  eval_function=target_score, seed=seed, clf_class=clf_class,
-                                  hyperparam_space=evolvable_hyperparams)
+    optimizer1 = Optimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict,
+                           eval_function=target_score, seed=seed, estimator_class=estimator_class,
+                           hyperparam_space=evolvable_hyperparams)
     result1 = optimizer1.optimize_clf(population_size=population, generations=generations)
-    optimizer2 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict,
-                                  eval_function=target_score, seed=seed, clf_class=clf_class,
-                                  hyperparam_space=evolvable_hyperparams)
+    optimizer2 = Optimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict,
+                           eval_function=target_score, seed=seed, estimator_class=estimator_class,
+                           hyperparam_space=evolvable_hyperparams)
     result2 = optimizer2.optimize_clf(population_size=population, generations=generations)
-    optimizer3 = SklearnOptimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict,
-                                  eval_function=target_score, seed=distinct_seed, clf_class=clf_class,
-                                  hyperparam_space=evolvable_hyperparams)
+    optimizer3 = Optimizer(features=X, labels=y, fitness_score="accuracy", metrics=default_metrics_dict,
+                           eval_function=target_score, seed=distinct_seed, estimator_class=estimator_class,
+                           hyperparam_space=evolvable_hyperparams)
     result3 = optimizer3.optimize_clf(population_size=population, generations=generations)
     assert str(result1) == str(result2)
     assert str(result1) != str(result3)

From e4918e4e99067d724f82715667d4dec024563cf2 Mon Sep 17 00:00:00 2001
From: Caparrini
Date: Sun, 31 Mar 2024 23:22:50 +0200
Subject: [PATCH 13/16] Updated: examples

---
 examples/plot_evolution.py    |  2 +-
 examples/plot_quickstart.py   | 25 +++++++++++++++++++++--
 examples/plot_search_space.py |  2 +-
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/examples/plot_evolution.py b/examples/plot_evolution.py
index b27825f..e95a7f9 100644
--- a/examples/plot_evolution.py
+++ b/examples/plot_evolution.py
@@ -25,7 +25,7 @@
 # %%
 # We use the Optimizer class to optimize a decision tree classifier.
-opt = Optimizer(model_class=DecisionTreeClassifier, features=X, labels=y,
+opt = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
                 hyperparam_space=hyperparam_space, folder="Evolution_example")
 
 # %%
diff --git a/examples/plot_quickstart.py b/examples/plot_quickstart.py
index b5d491e..e5e2437 100644
--- a/examples/plot_quickstart.py
+++ b/examples/plot_quickstart.py
@@ -15,6 +15,12 @@
 # Another dataset or a custom one can be used
 X, y = load_iris(return_X_y=True)
 
+# %%
+# Split the dataset into training and test sets
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+
 # %%
 # Define the HyperparameterSpace; you can use the default hyperparameters for the machine learning model
 # that you want to optimize. In this case we use the default hyperparameters for a DecisionTreeClassifier.
@@ -27,16 +33,31 @@
 # the second is the vector of labels and
 # the third (if provided) is the name of the folder where the results of mloptimizer Optimizers are saved.
 # The default value for this folder is "Optimizer"
-opt = Optimizer(model_class=DecisionTreeClassifier, features=X, labels=y,
+opt = Optimizer(estimator_class=DecisionTreeClassifier, features=X_train, labels=y_train,
                 hyperparam_space=hyperparam_space, folder="Optimizer")
 
 # %%
 # To optimize the classifier we need to call the optimize_clf method.
 # The first argument is the population size and
 # the second is the number of generations.
+# The method returns the best classifier with the best hyperparameters found.
 clf = opt.optimize_clf(10, 10)
+print(clf)
+
 # %%
-# The structure of the Optimizer folder is as follows:
+# Train the classifier with the best hyperparameters found and
+# show the classification report and the confusion matrix.
+from sklearn.metrics import classification_report, confusion_matrix, \
+    ConfusionMatrixDisplay
+import matplotlib.pyplot as plt
+
+clf.fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+cm = confusion_matrix(y_test, y_pred)
+print(classification_report(y_test, y_pred))
+disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
+disp.plot(cmap=plt.cm.Blues)
+plt.show()
 
 del opt
diff --git a/examples/plot_search_space.py b/examples/plot_search_space.py
index e3f2fd3..d4f9a78 100644
--- a/examples/plot_search_space.py
+++ b/examples/plot_search_space.py
@@ -25,7 +25,7 @@
 # %%
 # We use the Optimizer class to optimize a decision tree classifier.
-opt = Optimizer(model_class=DecisionTreeClassifier, features=X, labels=y,
+opt = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
                 hyperparam_space=hyperparam_space, folder="Search_space_example")
 
 # %%
From 00844957d8441ccd81ea4dcca36eef521a0a71db Mon Sep 17 00:00:00 2001
From: Caparrini
Date: Sun, 31 Mar 2024 23:47:04 +0200
Subject: [PATCH 14/16] Updated: mermaid class diagram, it needs more updates

---
 docs/sections/Concepts/index.rst | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/docs/sections/Concepts/index.rst b/docs/sections/Concepts/index.rst
index eb866f1..96d285b 100644
--- a/docs/sections/Concepts/index.rst
+++ b/docs/sections/Concepts/index.rst
@@ -9,7 +9,29 @@ framework. They are used to define the search space and the score function.
classDiagram class Optimizer{ +estimator_class estimator_class + +HyperparameterSpace hyperspace + +Tracker tracker + +Evaluator evaluator + +IndividualUtils individual_utils + optimize_clf() } + class HyperparameterSpace{ + +dict fixed_hyperparams + +dict evolvable_hyperparams + from_json() + to_json() + } + class Evaluator{ + evaluate() + evaluate_individual() + } + class IndividualUtils{ + individual2dict() + get_clf() + } + Optimizer "1" --o "1" HyperparameterSpace + Optimizer "1" --o "1" Evaluator + Optimizer "1" --o "1" IndividualUtils .. toctree:: From d6a7012b1fe095d419534a9cc298f061097bbd0c Mon Sep 17 00:00:00 2001 From: Caparrini Date: Sun, 31 Mar 2024 23:58:09 +0200 Subject: [PATCH 15/16] Fix: list --- docs/sections/Concepts/score_functions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/sections/Concepts/score_functions.rst b/docs/sections/Concepts/score_functions.rst index 6983448..9f5fe9e 100644 --- a/docs/sections/Concepts/score_functions.rst +++ b/docs/sections/Concepts/score_functions.rst @@ -5,6 +5,7 @@ Score Functions (NEED UPDATE) The `model_evaluation.py` module in our library provides several score functions that are used to evaluate the performance of machine learning algorithms. These score functions are crucial in the context of genetic optimization, where they serve as fitness values. In genetic optimization, a fitness value determines how well an individual (in this case, a machine learning algorithm defined by its hyperparameters) performs in a given generation. The better the fitness value, the more likely the individual is to survive and reproduce in the next generation. A score function takes as input: + - The true labels of the data - The predicted labels of the data - A machine learning algorithm complying with the scikit-learn API From 5959077082ca8b567fdc5654dcd1d6a6153d9916 Mon Sep 17 00:00:00 2001 From: Caparrini Date: Sun, 31 Mar 2024 23:59:43 +0200 Subject: [PATCH 16/16] v 0.8.0 --- docs/conf.py | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index eca7e9c..720618c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,9 +12,9 @@ sys.path.insert(0, os.path.abspath('..')) project = 'mloptimizer' -copyright = '2024, Antonio Caparrini' -author = 'Antonio Caparrini' -release = '0.7.1' +copyright = '2024, Antonio Caparrini, Javier Arroyo' +author = 'Antonio Caparrini, Javier Arroyo' +release = '0.8.0' # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/setup.py b/setup.py index 19af932..ea02da2 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ def read_requirements(requirements_file): # For a discussion on single-sourcing the version across setup.py and the # project code, see # https://packaging.python.org/guides/single-sourcing-package-version/ - version="0.7.1", # Required + version="0.8.0", # Required # This is a one-line description or tagline of what your project does. This # corresponds to the "Summary" metadata field: # https://packaging.python.org/specifications/core-metadata/#summary