v 0.8.0
Caparrini committed Mar 31, 2024
2 parents c877b63 + 5959077 commit 68ffcf7
Showing 41 changed files with 1,270 additions and 889 deletions.
7 changes: 4 additions & 3 deletions README.md
@@ -54,7 +54,7 @@ You can get more information about the package installation at https://pypi.org/
Here's a simple example of how to optimize hyperparameters in a decision tree classifier using the iris dataset:

```python
from mloptimizer.genoptimizer import SklearnOptimizer
from mloptimizer.core import Optimizer
from mloptimizer.hyperparams import HyperparameterSpace
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
@@ -66,7 +66,7 @@ X, y = load_iris(return_X_y=True)
hyperparameter_space = HyperparameterSpace.get_default_hyperparameter_space(DecisionTreeClassifier)

# 3) Create the optimizer and optimize the classifier
opt = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space)
opt = Optimizer(model_class=DecisionTreeClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space)

# 4) Optimize the classifier, the optimization returns the best estimator found in the optimization process
# - 10 generations starting with a population of 10 individuals, other parameters are set to default
@@ -146,7 +146,8 @@ with examples, classes and methods reference.

## Authors

* **Antonio Caparrini** - *Owner* - [caparrini](https://github.com/caparrini)
* **Antonio Caparrini** - *Author* - [caparrini](https://github.com/caparrini)
* **Javier Arroyo Gallardo** - *Author* - [javiag](https://github.com/javiag)

## License

9 changes: 5 additions & 4 deletions docs/conf.py
@@ -12,9 +12,9 @@
sys.path.insert(0, os.path.abspath('..'))

project = 'mloptimizer'
copyright = '2024, Antonio Caparrini'
author = 'Antonio Caparrini'
release = '0.7.1'
copyright = '2024, Antonio Caparrini, Javier Arroyo'
author = 'Antonio Caparrini, Javier Arroyo'
release = '0.8.0'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -31,7 +31,8 @@
'sphinx.ext.graphviz',
'sphinx.ext.intersphinx',
'autoapi.extension',
'sphinx_favicon'
'sphinx_favicon',
'sphinxcontrib.mermaid'
]

templates_path = ['_templates']
38 changes: 19 additions & 19 deletions docs/sections/Basics/overview.rst
@@ -4,15 +4,15 @@ Overview

Introduction
------------
The main class objects are the `SklearnOptimizer` and the `HyperparameterSpace` classes.
The main class objects are the `Optimizer` and the `HyperparameterSpace` classes.

The optimizer `SklearnOptimizer` is able to optimize any model that complies with the `sklearn` API.
The optimizer `Optimizer` is able to optimize any model that complies with the `sklearn` API.
The `HyperparameterSpace` class is used to define the hyperparameter search space: both
the fixed hyperparameters and the evolvable hyperparameters that will be optimized.

Usage
-----
To use the `SklearnOptimizer` class:
To use the `Optimizer` class:

1. Define your features and labels.
2. Choose a model to optimize that complies with the `sklearn` API (e.g. `XGBClassifier`).
@@ -23,9 +23,9 @@ To use the `SklearnOptimizer` class:
There are default HyperparameterSpaces defined in the ``conf`` folder for the most common models.
You can retrieve them with ``HyperparameterSpace.get_default_hyperparameter_space(class)``, where ``class`` is e.g. ``XGBClassifier``.

There are several parameters than can be passed to the `SklearnOptimizer` constructor:
There are several parameters that can be passed to the `Optimizer` constructor:

- `clf_class`: The class of the model to optimize. It should comply with the `sklearn` API.
- `estimator_class`: The class of the model to optimize. It should comply with the `sklearn` API.
- `X`: The features of your dataset.
- `y`: The labels of your dataset.
- `folder`: The folder where the files and folder will be saved. Defaults to the current directory.
@@ -43,13 +43,13 @@ The simplest example of using the Optimizer is:

- Store your features and labels in `X` and `y` respectively.
- Use HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier) to get the default hyperparameters for the model you want to optimize.
- Create an instance of `SklearnOptimizer` with your classifier class, hyperparameter space, data and leave all other parameters to their default values.
- Create an instance of `Optimizer` with your classifier class, hyperparameter space and data, leaving all other parameters at their default values.
- Call the `optimize_clf()` method to start the optimization process. You can pass the population size and the number of generations to the method.
- The result of the optimization process will be an object of type XGBClassifier with the best hyperparameters found.

.. code-block:: python
from mloptimizer.genoptimizer import SklearnOptimizer
from mloptimizer.core import Optimizer
from mloptimizer.hyperparams import HyperparameterSpace
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
@@ -61,18 +61,18 @@ The simplest example of using the Optimizer is:
hyperparameter_space = HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier)
# 3) Create the optimizer and optimize the classifier
opt = SklearnOptimizer(clf_class=XGBClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space)
opt = Optimizer(estimator_class=XGBClassifier, features=X, labels=y, hyperparam_space=hyperparameter_space)
clf = opt.optimize_clf(10, 10)
This will create a folder (in the current location) with name `YYYYMMDD_nnnnnnnnnn_SklearnOptimizer`
This will create a folder (in the current location) with name `YYYYMMDD_nnnnnnnnnn_Optimizer`
(where `YYYYMMDD_nnnnnnnnnn` is the current timestamp) and a log file named `mloptimizer.log`.
To inspect the structure of the folder and what you can find in it, please refer to the `Folder Structure` section.

Custom HyperparameterSpace Example
----------------------------------

Among the parameters that can be passed to the `SklearnOptimizer` constructor,
Among the parameters that can be passed to the `Optimizer` constructor,
the `hyperparam_space` of class `HyperparameterSpace` is particularly important and should be aligned
with the machine learning algorithm passed to the Optimizer; it holds both the `fixed_hyperparams`
and the `evolvable_hyperparams`.
@@ -107,8 +107,8 @@ An example of using custom hyperparameters is:
custom_hyperparam_space = HyperparameterSpace(fixed_hyperparams, evolvable_hyperparams)
# Create an instance of XGBClassifierOptimizer with custom hyperparameters
xgb_optimizer = SklearnOptimizer(clf_class=XGBClassifier,features=X, labels=y,
hyperparam_space=custom_hyperparam_space)
xgb_optimizer = Optimizer(estimator_class=XGBClassifier,features=X, labels=y,
hyperparam_space=custom_hyperparam_space)
# Start the optimization process
result = xgb_optimizer.optimize_clf(3, 3)
@@ -127,15 +127,15 @@ Researchers often need to be able to reproduce their results. During the researc
advisable to run several optimization processes with different parameters or input data.
However, if the results of the optimization process are not reproducible, it will be difficult to compare
the results of the different optimization processes.
In order to make the results reproducible, the `SklearnOptimizer` have a `seed` parameter.
In order to make the results reproducible, the `Optimizer` has a `seed` parameter.
This parameter is used to set the seed of the random number generator used during the optimization process.
If you set the same seed, the results of the optimization process will be the same.

An example of two executions of the optimization process with the same seed that will produce the same result is:

.. code-block:: python
from mloptimizer.genoptimizer import SklearnOptimizer
from mloptimizer.core import Optimizer
from mloptimizer.hyperparams import HyperparameterSpace
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
@@ -146,13 +146,13 @@ An example of two executions of the optimization process with the same seed that
# 2) Define the hyperparameter space (a default space is provided for some algorithms)
hyperparameter_space = HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier)
# 3) Create two instances of SklearnOptimizer with the same seed
xgb_optimizer1 = SklearnOptimizer(clf_class=XGBClassifier, features=X, labels=y,
hyperparam_space = hyperparameter_space, seed=42)
# 3) Create two instances of Optimizer with the same seed
xgb_optimizer1 = Optimizer(estimator_class=XGBClassifier, features=X, labels=y,
hyperparam_space = hyperparameter_space, seed=42)
result1 = xgb_optimizer1.optimize_clf(3, 3)
xgb_optimizer2 = SklearnOptimizer(clf_class=XGBClassifier, features=X, labels=y,
hyperparam_space = hyperparameter_space, seed=42)
xgb_optimizer2 = Optimizer(estimator_class=XGBClassifier, features=X, labels=y,
hyperparam_space = hyperparameter_space, seed=42)
result2 = xgb_optimizer2.optimize_clf(3, 3)
# Verify that the results are the same
10 changes: 5 additions & 5 deletions docs/sections/Concepts/hyperparam.rst
@@ -101,15 +101,15 @@ Here's an example of how you can create a `HyperparameterSpace` instance and pas
# Then we can use the hyperparam_space instance to optimize the hyperparameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from mloptimizer.genoptimizer import SklearnOptimizer
from mloptimizer.core import Optimizer
# Load the iris dataset
X,y = load_iris(return_X_y=True)
tree_optimizer = SklearnOptimizer(clf_class=DecisionTreeClassifier,
hyperparam_space=hyperparam_space,
features=X, labels=y)
tree_optimizer = Optimizer(estimator_class=DecisionTreeClassifier,
hyperparam_space=hyperparam_space,
features=X, labels=y)
tree_optimizer.optimize_clf(3, 3)
In this example, we define custom hyperparameters and create a `HyperparameterSpace` instance. We then use the `HyperparameterSpace` instance to optimize the hyperparameters of a `DecisionTreeClassifier` using the `SklearnOptimizer` class.
In this example, we define custom hyperparameters and create a `HyperparameterSpace` instance. We then use the `HyperparameterSpace` instance to optimize the hyperparameters of a `DecisionTreeClassifier` using the `Optimizer` class.
29 changes: 29 additions & 0 deletions docs/sections/Concepts/index.rst
@@ -4,6 +4,35 @@ Concepts
Concepts are the building blocks of the hyperparameter optimization
framework. They are used to define the search space and the score function.

.. mermaid::

classDiagram
class Optimizer{
+estimator_class estimator_class
+HyperparameterSpace hyperspace
+Tracker tracker
+Evaluator evaluator
+IndividualUtils individual_utils
optimize_clf()
}
class HyperparameterSpace{
+dict fixed_hyperparams
+dict evolvable_hyperparams
from_json()
to_json()
}
class Evaluator{
evaluate()
evaluate_individual()
}
class IndividualUtils{
individual2dict()
get_clf()
}
Optimizer "1" --o "1" HyperparameterSpace
Optimizer "1" --o "1" Evaluator
Optimizer "1" --o "1" IndividualUtils


.. toctree::
:hidden:
6 changes: 3 additions & 3 deletions docs/sections/Concepts/parallel.rst
@@ -18,7 +18,7 @@ An example of the speedup that can be achieved using parallel processing is show

.. code-block:: python
from mloptimizer.genoptimizer import SklearnOptimizer
from mloptimizer.core import Optimizer
from mloptimizer.hyperparams import HyperparameterSpace
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
@@ -35,14 +35,14 @@
population = 50
generations = 4
opt_with_parallel = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
opt_with_parallel = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=hyperparameter_space, seed=my_seed, use_parallel=True)
start_time_parallel = time.time()
clf_with_parallel = opt_with_parallel.optimize_clf(population, generations)
end_time_parallel = time.time()
opt = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
opt = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=hyperparameter_space, seed=my_seed, use_parallel=False)
start_time = time.time()
clf = opt.optimize_clf(population, generations)
14 changes: 7 additions & 7 deletions docs/sections/Concepts/reproducibility.rst
@@ -17,7 +17,7 @@ An example of usage is:
from sklearn.datasets import load_breast_cancer as dataset
from sklearn.tree import DecisionTreeClassifier
from mloptimizer.genoptimizer import SklearnOptimizer
from mloptimizer.core import Optimizer
from mloptimizer.hyperparams import HyperparameterSpace
X, y = load_iris(return_X_y=True)
@@ -28,19 +28,19 @@ An example of usage is:
distinct_seed = 2
# It is important to run the optimization
# right after the creation of the optimizer
optimizer1 = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=default_hyperparam_space, seed=seed)
optimizer1 = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=default_hyperparam_space, seed=seed)
result1 = optimizer1.optimize_clf(population=population,
generations=generations)
# WARNING: In case the optimizer2 would be created after the optimizer1,
# the results would be different
optimizer2 = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=default_hyperparam_space, seed=seed)
optimizer2 = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=default_hyperparam_space, seed=seed)
result2 = optimizer2.optimize_clf(population=population,
generations=generations)
optimizer3 = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=default_hyperparam_space, seed=distinct_seed)
optimizer3 = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=default_hyperparam_space, seed=distinct_seed)
result3 = optimizer3.optimize_clf(population=population,
generations=generations)
str(result1) == str(result2)
7 changes: 4 additions & 3 deletions docs/sections/Concepts/score_functions.rst
@@ -1,10 +1,11 @@
====================
Score Functions
====================
=============================
Score Functions (NEED UPDATE)
=============================

The `model_evaluation.py` module in our library provides several score functions that are used to evaluate the performance of machine learning algorithms. These score functions are crucial in the context of genetic optimization, where they serve as fitness values. In genetic optimization, a fitness value determines how well an individual (in this case, a machine learning algorithm defined by its hyperparameters) performs in a given generation. The better the fitness value, the more likely the individual is to survive and reproduce in the next generation.

A score function takes as input:

- The true labels of the data
- The predicted labels of the data
- A machine learning algorithm complying with the scikit-learn API
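
As an illustrative sketch only (the exact signature expected by `model_evaluation.py` is not
shown in this hunk), a custom score function along these lines could look like:

.. code-block:: python

    from sklearn.metrics import balanced_accuracy_score

    def balanced_accuracy_fitness(y_true, y_pred, estimator=None):
        # Hypothetical score function used as a fitness value: higher is better.
        # The estimator argument is accepted to mirror the inputs listed above,
        # even though this particular metric does not need it.
        return balanced_accuracy_score(y_true, y_pred)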
10 changes: 5 additions & 5 deletions examples/plot_evolution.py
@@ -4,10 +4,10 @@
mloptimizer provides a function to plot the evolution of the fitness function.
"""

from mloptimizer.genoptimizer import SklearnOptimizer
from mloptimizer.core import Optimizer
from mloptimizer.hyperparams import HyperparameterSpace
from sklearn.tree import DecisionTreeClassifier
from mloptimizer.plots import plotly_logbook
from mloptimizer.aux.plots import plotly_logbook
import plotly
import os
from sklearn.datasets import load_iris
@@ -25,8 +25,8 @@

# %%
# We use the Optimizer class to optimize a decision tree classifier.
opt = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=hyperparam_space, folder="Evolution_example")
opt = Optimizer(estimator_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=hyperparam_space, folder="Evolution_example")

# %%
# To optimize the classifier we need to call the optimize_clf method.
@@ -40,7 +40,7 @@
# The black lines represent the max and min fitness values across all generations.
# The green, red and blue lines are respectively the max, min and avg fitness values for each generation.
# Each grey point in the graph represents an individual.
population_df = opt.population_2_df()
population_df = opt.runs[-1].population_2_df()
g_logbook = plotly_logbook(opt.logbook, population_df)
plotly.io.show(g_logbook)

34 changes: 28 additions & 6 deletions examples/plot_quickstart.py
@@ -5,18 +5,22 @@
Firstly, we import the necessary libraries to get data and plot the results.
"""

from mloptimizer.genoptimizer import SklearnOptimizer
from mloptimizer.core import Optimizer
from mloptimizer.hyperparams import HyperparameterSpace
from sklearn.tree import DecisionTreeClassifier
from mloptimizer.plots import plotly_logbook, plotly_search_space
import plotly
from sklearn.datasets import load_iris

# %%
# Load the iris dataset to obtain a vector of features X and a vector of labels y.
# Another dataset or a custom one can be used
X, y = load_iris(return_X_y=True)

# %%
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# %%
# Define the HyperparameterSpace, you can use the default hyperparameters for the machine learning model
# that you want to optimize. In this case we use the default hyperparameters for a DecisionTreeClassifier.
@@ -29,16 +29,34 @@
# the second is the vector of labels and
# the third (if provided) is the name of the folder where the results of mloptimizer Optimizers are saved.
# The default value for this folder is "Optimizer"
opt = SklearnOptimizer(clf_class=DecisionTreeClassifier, features=X, labels=y,
hyperparam_space=hyperparam_space, folder="Optimizer")
opt = Optimizer(estimator_class=DecisionTreeClassifier, features=X_train, labels=y_train,
hyperparam_space=hyperparam_space, folder="Optimizer")

# %%
# To optimize the classifier we need to call the optimize_clf method.
# The first argument is the population size (the number of individuals per generation) and
# the second is the number of generations.
# The method returns the best classifier with the best hyperparameters found.
clf = opt.optimize_clf(10, 10)

print(clf)

# %%
# The structure of the Optimizer folder is as follows:
# Train the classifier with the best hyperparameters found
# Show the classification report and the confusion matrix
from sklearn.metrics import classification_report, confusion_matrix, \
ConfusionMatrixDisplay
import matplotlib.pyplot as plt

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
# Build the display from the precomputed confusion matrix;
# ConfusionMatrixDisplay.from_predictions would recompute it and
# render a figure of its own, so a single plot call is enough here.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()

del opt