From 6bf81df1d593df07cc29838e61265414b6bf8958 Mon Sep 17 00:00:00 2001 From: W0lfgunbl00d Date: Tue, 5 Nov 2024 14:49:32 +0100 Subject: [PATCH 01/24] v1test implemented the rls, no tests yet --- river/linear_model/__init__.py | 2 ++ river/linear_model/rls.py | 46 ++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 river/linear_model/rls.py diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py index 756720490a..e74d0d1439 100644 --- a/river/linear_model/__init__.py +++ b/river/linear_model/__init__.py @@ -10,6 +10,7 @@ from .pa import PAClassifier, PARegressor from .perceptron import Perceptron from .softmax import SoftmaxRegression +from .rls import RLS __all__ = [ "base", @@ -21,4 +22,5 @@ "PARegressor", "Perceptron", "SoftmaxRegression", + "RLS", ] diff --git a/river/linear_model/rls.py b/river/linear_model/rls.py new file mode 100644 index 0000000000..71e189b0b3 --- /dev/null +++ b/river/linear_model/rls.py @@ -0,0 +1,46 @@ +import numpy as np + +class RLS(object): + + def __init__(self, p: int, l: float, delta: float): + self.p = p + self.l = l + self.delta = delta + + self.currentStep = 0 + + self.x = np.zeros((p + 1, 1)) # Column vector + self.P = np.identity(p + 1) * self.delta + + self.estimates = [] + self.estimates.append(np.zeros((p + 1, 1))) # Weight vector initialized to zeros + + self.Pks = [] + self.Pks.append(self.P) + + def estimate(self, xn: float, dn: float): + # Update input vector + self.x = np.roll(self.x, -1) + self.x[-1, 0] = xn + + # Get previous weight vector + wn_prev = self.estimates[-1] + + # Compute gain vector + denominator = self.l + self.x.T @ self.Pks[-1] @ self.x + gn = (self.Pks[-1] @ self.x) / denominator + + # Compute a priori error + alpha = dn - (self.x.T @ wn_prev) + + # Update inverse correlation matrix + Pn = (self.Pks[-1] - gn @ self.x.T @ self.Pks[-1]) / self.l + self.Pks.append(Pn) + + # Update weight vector + wn = wn_prev + gn * alpha + self.estimates.append(wn) + + self.currentStep += 1 + + return wn From 6aa486973fda441ef55df36389d34adeb71d02fc Mon Sep 17 00:00:00 2001 From: W0lfgunbl00d Date: Tue, 5 Nov 2024 14:54:51 +0100 Subject: [PATCH 02/24] Update rls.py comments --- river/linear_model/rls.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/river/linear_model/rls.py b/river/linear_model/rls.py index 71e189b0b3..9aa69410e1 100644 --- a/river/linear_model/rls.py +++ b/river/linear_model/rls.py @@ -1,11 +1,12 @@ import numpy as np + class RLS(object): def __init__(self, p: int, l: float, delta: float): - self.p = p - self.l = l - self.delta = delta + self.p = p # Filter order + self.l = l # Forgetting factor + self.delta = delta # Value to initialise P(0) self.currentStep = 0 From 814fd8a9ceadb5be3933c9f0f510161d4818f01a Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:19:07 +0100 Subject: [PATCH 03/24] Added an v0 adpredictor --- river/base/Adpredictor.py | 73 ++++++++++++ river/base/Adpredictor_test.ipynb | 189 ++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+) create mode 100644 river/base/Adpredictor.py create mode 100644 river/base/Adpredictor_test.ipynb diff --git a/river/base/Adpredictor.py b/river/base/Adpredictor.py new file mode 100644 index 0000000000..f58d326d14 --- /dev/null +++ b/river/base/Adpredictor.py @@ -0,0 +1,73 @@ +import numpy as np +from river import base +from river import optim +from collections import defaultdict +import logging +from collections import namedtuple +from river import compose, preprocessing, metrics + +logger = logging.getLogger(__name__) + +class AdPredictor(base.Classifier): + config = namedtuple('config', ['beta', 'prior_probability', 'epsilon', 'num_features']) + def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): + self.beta = beta + self.prior_probability = prior_probability + self.epsilon = epsilon + self.num_features = num_features + self.weights = defaultdict(lambda: {'mean': 0.0, 'variance': 1.0}) + self.bias_weight = self.prior_bias_weight() + + def prior_bias_weight(self): + return np.log(self.prior_probability / (1 - self.prior_probability)) / self.beta + + def _active_mean_variance(self, features): + total_mean = sum(self.weights[f]['mean'] for f in features) + self.bias_weight + total_variance = sum(self.weights[f]['variance'] for f in features) + self.beta ** 2 + return total_mean, total_variance + + def predict_one(self, x): + features = x.keys() + total_mean, total_variance = self._active_mean_variance(features) + return 1 / (1 + np.exp(-total_mean / np.sqrt(total_variance))) + + def learn_one(self, x, y): + features = x.keys() + y = 1 if y else -1 # Map label to ±1 for binary classification + total_mean, total_variance = self._active_mean_variance(features) + v, w = self.gaussian_corrections(y * total_mean / np.sqrt(total_variance)) + + for feature in features: + mean = self.weights[feature]['mean'] + variance = self.weights[feature]['variance'] + + mean_delta = y * variance / np.sqrt(total_variance) * v + variance_multiplier = 1.0 - variance / total_variance * w + + # Update weight + self.weights[feature]['mean'] = mean + mean_delta + self.weights[feature]['variance'] = variance * variance_multiplier + + def gaussian_corrections(self, score): + """Compute Gaussian corrections for Bayesian update.""" + cdf = 1 / (1 + np.exp(-score)) + pdf = np.exp(-0.5 * score ** 2) / np.sqrt(2 * np.pi) + v = pdf / cdf + w = v * (v + score) + return v, w + + def _apply_dynamics(self, weight): + prior_variance = 1.0 + adjusted_variance = weight['variance'] * prior_variance / \ + ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight['variance']) + adjusted_mean = adjusted_variance * ( + (1.0 - self.epsilon) * weight['mean'] / weight['variance'] + + self.epsilon * 0 / prior_variance) + return {'mean': adjusted_mean, 'variance': adjusted_variance} + + def __str__(self): + return "AdPredictor" + + + + diff --git a/river/base/Adpredictor_test.ipynb b/river/base/Adpredictor_test.ipynb new file mode 100644 index 0000000000..b0897c1d3c --- /dev/null +++ b/river/base/Adpredictor_test.ipynb @@ -0,0 +1,189 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-11-09T15:04:34.707796Z", + "start_time": "2024-11-09T15:04:34.663651Z" + } + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "execution_count": 9 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:37.139365Z", + "start_time": "2024-11-09T15:04:37.097733Z" + } + }, + "cell_type": "code", + "source": [ + "import os\n", + "print(os.getcwd())" + ], + "id": "65e3111b76277fc5", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C:\\River\\riverIDLIB\\river\\base\n" + ] + } + ], + "execution_count": 10 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:38.751355Z", + "start_time": "2024-11-09T15:04:38.708257Z" + } + }, + "cell_type": "code", + "source": [ + "from river import datasets\n", + "from river import metrics\n", + "from river import preprocessing\n", + "from river import compose\n" + ], + "id": "3ffeadeef731f48e", + "outputs": [], + "execution_count": 11 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:45.986384Z", + "start_time": "2024-11-09T15:04:45.943819Z" + } + }, + "cell_type": "code", + "source": [ + "import importlib.util\n", + "\n", + "spec = importlib.util.spec_from_file_location(\"AdPredictor\", \"./AdPredictor.py\")\n", + "AdPredictor_module = importlib.util.module_from_spec(spec)\n", + "spec.loader.exec_module(AdPredictor_module)\n", + "\n", + "AdPredictor = AdPredictor_module.AdPredictor" + ], + "id": "e323aa048e864b33", + "outputs": [], + "execution_count": 12 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:49.810249Z", + "start_time": "2024-11-09T15:04:49.768766Z" + } + }, + "cell_type": "code", + "source": "phishing_data = datasets.Phishing()", + "id": "c862e33d656cb230", + "outputs": [], + "execution_count": 13 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:54.402788Z", + "start_time": "2024-11-09T15:04:54.358522Z" + } + }, + "cell_type": "code", + "source": [ + "model = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=9)\n", + "metric = metrics.Accuracy()" + ], + "id": "293c681cca67e1f4", + "outputs": [], + "execution_count": 14 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:04:59.609037Z", + "start_time": "2024-11-09T15:04:59.569533Z" + } + }, + "cell_type": "code", + "source": [ + "model_pipeline = compose.Pipeline(\n", + " ('scale', preprocessing.StandardScaler()),\n", + " ('predictor',model)\n", + ")" + ], + "id": "c8fb1bb8ed902d80", + "outputs": [], + "execution_count": 15 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-09T15:03:47.107046Z", + "start_time": "2024-11-09T15:03:47.054657Z" + } + }, + "cell_type": "code", + "source": [ + "for x, y in phishing_data:\n", + " y_pred = model_pipeline.predict_one(x)\n", + " metric = metric.update(y, y_pred)\n", + " model_pipeline = model_pipeline.learn_one(x, y)\n", + " print(f'Prediction: {y_pred}, Metric: {metric}')\n", + " break " + ], + "id": "1dea7c542ab4ad84", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediction: 0.5, Metric: None\n" + ] + } + ], + "execution_count": 8 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b30974791713c556c7c1afb194ac28fdb9bac4ed Mon Sep 17 00:00:00 2001 From: Mo3ad-S <155067453+Mo3ad-S@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:22:11 +0100 Subject: [PATCH 04/24] Added an v0 adpredictor --- river/base/Adpredictor.py | 49 +++++++------ river/base/Adpredictor_test.ipynb | 113 +++++++++++++++--------------- 2 files changed, 82 insertions(+), 80 deletions(-) diff --git a/river/base/Adpredictor.py b/river/base/Adpredictor.py index f58d326d14..4702760cf8 100644 --- a/river/base/Adpredictor.py +++ b/river/base/Adpredictor.py @@ -1,29 +1,32 @@ +from __future__ import annotations + +import logging +from collections import defaultdict, namedtuple + import numpy as np + from river import base -from river import optim -from collections import defaultdict -import logging -from collections import namedtuple -from river import compose, preprocessing, metrics logger = logging.getLogger(__name__) + class AdPredictor(base.Classifier): - config = namedtuple('config', ['beta', 'prior_probability', 'epsilon', 'num_features']) + config = namedtuple("config", ["beta", "prior_probability", "epsilon", "num_features"]) + def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): self.beta = beta self.prior_probability = prior_probability self.epsilon = epsilon self.num_features = num_features - self.weights = defaultdict(lambda: {'mean': 0.0, 'variance': 1.0}) + self.weights = defaultdict(lambda: {"mean": 0.0, "variance": 1.0}) self.bias_weight = self.prior_bias_weight() def prior_bias_weight(self): return np.log(self.prior_probability / (1 - self.prior_probability)) / self.beta def _active_mean_variance(self, features): - total_mean = sum(self.weights[f]['mean'] for f in features) + self.bias_weight - total_variance = sum(self.weights[f]['variance'] for f in features) + self.beta ** 2 + total_mean = sum(self.weights[f]["mean"] for f in features) + self.bias_weight + total_variance = sum(self.weights[f]["variance"] for f in features) + self.beta**2 return total_mean, total_variance def predict_one(self, x): @@ -38,36 +41,36 @@ def learn_one(self, x, y): v, w = self.gaussian_corrections(y * total_mean / np.sqrt(total_variance)) for feature in features: - mean = self.weights[feature]['mean'] - variance = self.weights[feature]['variance'] + mean = self.weights[feature]["mean"] + variance = self.weights[feature]["variance"] mean_delta = y * variance / np.sqrt(total_variance) * v variance_multiplier = 1.0 - variance / total_variance * w # Update weight - self.weights[feature]['mean'] = mean + mean_delta - self.weights[feature]['variance'] = variance * variance_multiplier + self.weights[feature]["mean"] = mean + mean_delta + self.weights[feature]["variance"] = variance * variance_multiplier def gaussian_corrections(self, score): """Compute Gaussian corrections for Bayesian update.""" cdf = 1 / (1 + np.exp(-score)) - pdf = np.exp(-0.5 * score ** 2) / np.sqrt(2 * np.pi) + pdf = np.exp(-0.5 * score**2) / np.sqrt(2 * np.pi) v = pdf / cdf w = v * (v + score) return v, w def _apply_dynamics(self, weight): prior_variance = 1.0 - adjusted_variance = weight['variance'] * prior_variance / \ - ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight['variance']) + adjusted_variance = ( + weight["variance"] + * prior_variance + / ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight["variance"]) + ) adjusted_mean = adjusted_variance * ( - (1.0 - self.epsilon) * weight['mean'] / weight['variance'] + - self.epsilon * 0 / prior_variance) - return {'mean': adjusted_mean, 'variance': adjusted_variance} + (1.0 - self.epsilon) * weight["mean"] / weight["variance"] + + self.epsilon * 0 / prior_variance + ) + return {"mean": adjusted_mean, "variance": adjusted_variance} def __str__(self): return "AdPredictor" - - - - diff --git a/river/base/Adpredictor_test.ipynb b/river/base/Adpredictor_test.ipynb index b0897c1d3c..5a6b4b4ab9 100644 --- a/river/base/Adpredictor_test.ipynb +++ b/river/base/Adpredictor_test.ipynb @@ -2,18 +2,15 @@ "cells": [ { "cell_type": "code", + "execution_count": 9, "id": "initial_id", "metadata": { - "collapsed": true, "ExecuteTime": { "end_time": "2024-11-09T15:04:34.707796Z", "start_time": "2024-11-09T15:04:34.663651Z" - } + }, + "collapsed": true }, - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n" - ], "outputs": [ { "name": "stdout", @@ -24,21 +21,23 @@ ] } ], - "execution_count": 9 + "source": [ + "from __future__ import annotations\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] }, { + "cell_type": "code", + "execution_count": 10, + "id": "65e3111b76277fc5", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:37.139365Z", "start_time": "2024-11-09T15:04:37.097733Z" } }, - "cell_type": "code", - "source": [ - "import os\n", - "print(os.getcwd())" - ], - "id": "65e3111b76277fc5", "outputs": [ { "name": "stdout", @@ -48,34 +47,38 @@ ] } ], - "execution_count": 10 + "source": [ + "import os\n", + "\n", + "print(os.getcwd())" + ] }, { + "cell_type": "code", + "execution_count": 11, + "id": "3ffeadeef731f48e", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:38.751355Z", "start_time": "2024-11-09T15:04:38.708257Z" } }, - "cell_type": "code", - "source": [ - "from river import datasets\n", - "from river import metrics\n", - "from river import preprocessing\n", - "from river import compose\n" - ], - "id": "3ffeadeef731f48e", "outputs": [], - "execution_count": 11 + "source": [ + "from river import compose, datasets, metrics, preprocessing" + ] }, { + "cell_type": "code", + "execution_count": 12, + "id": "e323aa048e864b33", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:45.986384Z", "start_time": "2024-11-09T15:04:45.943819Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "import importlib.util\n", "\n", @@ -84,75 +87,64 @@ "spec.loader.exec_module(AdPredictor_module)\n", "\n", "AdPredictor = AdPredictor_module.AdPredictor" - ], - "id": "e323aa048e864b33", - "outputs": [], - "execution_count": 12 + ] }, { + "cell_type": "code", + "execution_count": 13, + "id": "c862e33d656cb230", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:49.810249Z", "start_time": "2024-11-09T15:04:49.768766Z" } }, - "cell_type": "code", - "source": "phishing_data = datasets.Phishing()", - "id": "c862e33d656cb230", "outputs": [], - "execution_count": 13 + "source": [ + "phishing_data = datasets.Phishing()" + ] }, { + "cell_type": "code", + "execution_count": 14, + "id": "293c681cca67e1f4", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:54.402788Z", "start_time": "2024-11-09T15:04:54.358522Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "model = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=9)\n", "metric = metrics.Accuracy()" - ], - "id": "293c681cca67e1f4", - "outputs": [], - "execution_count": 14 + ] }, { + "cell_type": "code", + "execution_count": 15, + "id": "c8fb1bb8ed902d80", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:04:59.609037Z", "start_time": "2024-11-09T15:04:59.569533Z" } }, - "cell_type": "code", - "source": [ - "model_pipeline = compose.Pipeline(\n", - " ('scale', preprocessing.StandardScaler()),\n", - " ('predictor',model)\n", - ")" - ], - "id": "c8fb1bb8ed902d80", "outputs": [], - "execution_count": 15 + "source": [ + "model_pipeline = compose.Pipeline((\"scale\", preprocessing.StandardScaler()), (\"predictor\", model))" + ] }, { + "cell_type": "code", + "execution_count": 8, + "id": "1dea7c542ab4ad84", "metadata": { "ExecuteTime": { "end_time": "2024-11-09T15:03:47.107046Z", "start_time": "2024-11-09T15:03:47.054657Z" } }, - "cell_type": "code", - "source": [ - "for x, y in phishing_data:\n", - " y_pred = model_pipeline.predict_one(x)\n", - " metric = metric.update(y, y_pred)\n", - " model_pipeline = model_pipeline.learn_one(x, y)\n", - " print(f'Prediction: {y_pred}, Metric: {metric}')\n", - " break " - ], - "id": "1dea7c542ab4ad84", "outputs": [ { "name": "stdout", @@ -162,7 +154,14 @@ ] } ], - "execution_count": 8 + "source": [ + "for x, y in phishing_data:\n", + " y_pred = model_pipeline.predict_one(x)\n", + " metric = metric.update(y, y_pred)\n", + " model_pipeline = model_pipeline.learn_one(x, y)\n", + " print(f\"Prediction: {y_pred}, Metric: {metric}\")\n", + " break" + ] } ], "metadata": { From 27b55e9ba5fd15bc3358473ed001bab50660fbca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarwane?= <“marwane.el-hani@imt-atlantique.net”> Date: Sun, 10 Nov 2024 19:13:07 +0100 Subject: [PATCH 05/24] Ajouts fichier + test v0 --- river/active/te.py | 5 ++ river/model_selection/hoeffding_races.py | 56 +++++++++++++ river/model_selection/hoeffding_races_test.py | 28 +++++++ setup.py | 82 +++++++++++++++++++ 4 files changed, 171 insertions(+) create mode 100644 river/active/te.py create mode 100644 river/model_selection/hoeffding_races.py create mode 100644 river/model_selection/hoeffding_races_test.py create mode 100644 setup.py diff --git a/river/active/te.py b/river/active/te.py new file mode 100644 index 0000000000..583e983a4e --- /dev/null +++ b/river/active/te.py @@ -0,0 +1,5 @@ +from river import linear_model + +model = linear_model.LinearRegression() + +print(model) \ No newline at end of file diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py new file mode 100644 index 0000000000..871e9b4711 --- /dev/null +++ b/river/model_selection/hoeffding_races.py @@ -0,0 +1,56 @@ +import math +from river import metrics, base + + +class HoeffdingRace(base.Estimator): + def __init__(self, models, delta=0.05, metric=metrics.Accuracy()): + # Initialisation des modèles, du delta, de la métrique par défaut et des variables de suivi + self.models = models + self.delta = delta + self.metric = metric # Métrique de base (peut être utilisée pour chaque modèle) + self.n_obs = 0 # Compteur + # Initialiser des métriques distinctes pour chaque modèle + self.model_metrics = {name: metric.clone() for name in models.keys()} + self.model_performance = {name: 0 for name in models.keys()} # Suivi des performances des modèles + self.remaining_models = set(models.keys()) # Liste des modèles restant + + + def hoeffding_bound(self, n): + return math.sqrt((math.log(1 / self.delta)) / (2 * n)) + + def learn_one(self, x, y): + self.n_obs += 1 + best_perf = max(self.model_performance.values()) / self.n_obs if self.n_obs > 0 else 0 + + for name in list(self.remaining_models): + y_pred = self.models[name].predict_one(x) + + self.models[name].learn_one(x, y) + + # Update performance + result = self.model_metrics[name].update(y, y_pred) + + self.model_performance[name] += self.model_metrics[name].get() / self.n_obs + + # Elimination check + avg_perf = self.model_performance[name] + + if avg_perf + self.hoeffding_bound(self.n_obs) < best_perf: + self.remaining_models.remove(name) + print(f"{name} éliminé après {self.n_obs} observations") + + + + # Early stopping if only one model remains + if len(self.remaining_models) == 1: + print(f"{list(self.remaining_models)[0]} est le modèle sélectionné.") + return list(self.remaining_models)[0] + + + + def predict_one(self, x): + # Prediction by best remaining model + if len(self.remaining_models) == 1: + return self.models[list(self.remaining_models)[0]].predict_one(x) + return None # Pas de prédiction tant qu'un modèle n'est pas sélectionné + diff --git a/river/model_selection/hoeffding_races_test.py b/river/model_selection/hoeffding_races_test.py new file mode 100644 index 0000000000..8072f2a52a --- /dev/null +++ b/river/model_selection/hoeffding_races_test.py @@ -0,0 +1,28 @@ +from river import linear_model, neighbors, tree +from river.datasets import synth +from river import datasets +from river import metrics +from hoeffding_races import HoeffdingRace + +metric = metrics.Accuracy() + +# Modèles candidats +models = { + 'Regression': linear_model.LogisticRegression(), + 'KNN': neighbors.KNNClassifier(), + 'DecisionTree': tree.HoeffdingTreeClassifier() +} + +# Initialisation de HoeffdingRace + +hoeffding_race = HoeffdingRace(models=models,metric=metric) + +# Exécution sur un flux de données +dataset = datasets.CreditCard() +n=0 +for x,y in dataset: + selected_model = hoeffding_race.learn_one(x, y) + if selected_model: + break +print(n) +print(hoeffding_race.model_performance) \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000..65fe00ee98 --- /dev/null +++ b/setup.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +from setuptools import setup + +packages = \ +['river', + 'river.active', + 'river.anomaly', + 'river.bandit', + 'river.bandit.datasets', + 'river.bandit.envs', + 'river.base', + 'river.checks', + 'river.cluster', + 'river.compat', + 'river.compose', + 'river.conf', + 'river.covariance', + 'river.datasets', + 'river.datasets.synth', + 'river.drift', + 'river.drift.binary', + 'river.drift.datasets', + 'river.ensemble', + 'river.evaluate', + 'river.facto', + 'river.feature_extraction', + 'river.feature_selection', + 'river.forest', + 'river.imblearn', + 'river.linear_model', + 'river.metrics', + 'river.metrics.efficient_rollingrocauc', + 'river.metrics.multioutput', + 'river.misc', + 'river.model_selection', + 'river.multiclass', + 'river.multioutput', + 'river.naive_bayes', + 'river.neighbors', + 'river.neighbors.ann', + 'river.neural_net', + 'river.optim', + 'river.preprocessing', + 'river.proba', + 'river.reco', + 'river.rules', + 'river.sketch', + 'river.stats', + 'river.stream', + 'river.time_series', + 'river.tree', + 'river.tree.mondrian', + 'river.tree.nodes', + 'river.tree.split_criterion', + 'river.tree.splitter', + 'river.utils'] + +package_data = \ +{'': ['*'], 'river.metrics': ['efficient_rollingrocauc/cpp/*']} + +install_requires = \ +['numpy>=1.23.0,<2.0.0', 'pandas>=2.1,<3.0', 'scipy>=1.12.1,<2.0.0'] + +setup_kwargs = { + 'name': 'river', + 'version': '0.21.2', + 'description': 'Online machine learning in Python', + 'long_description': '

\n river_logo\n

\n\n

\n \n \n unit-tests\n \n \n \n code-quality\n \n \n \n documentation\n \n \n \n discord\n \n \n \n pypi\n \n \n \n pepy\n \n \n \n black\n \n \n \n mypy\n \n \n \n bsd_3_license\n \n

\n\n
\n\n

\n River is a Python library for online machine learning. It aims to be the most user-friendly library for doing machine learning on streaming data. River is the result of a merger between creme and scikit-multiflow.\n

\n\n## ⚡️ Quickstart\n\nAs a quick example, we\'ll train a logistic regression to classify the [website phishing dataset](http://archive.ics.uci.edu/ml/datasets/Website+Phishing). Here\'s a look at the first observation in the dataset.\n\n```python\n>>> from pprint import pprint\n>>> from river import datasets\n\n>>> dataset = datasets.Phishing()\n\n>>> for x, y in dataset:\n... pprint(x)\n... print(y)\n... break\n{\'age_of_domain\': 1,\n \'anchor_from_other_domain\': 0.0,\n \'empty_server_form_handler\': 0.0,\n \'https\': 0.0,\n \'ip_in_url\': 1,\n \'is_popular\': 0.5,\n \'long_url\': 1.0,\n \'popup_window\': 0.0,\n \'request_from_other_domain\': 0.0}\nTrue\n\n```\n\nNow let\'s run the model on the dataset in a streaming fashion. We sequentially interleave predictions and model updates. Meanwhile, we update a performance metric to see how well the model is doing.\n\n```python\n>>> from river import compose\n>>> from river import linear_model\n>>> from river import metrics\n>>> from river import preprocessing\n\n>>> model = compose.Pipeline(\n... preprocessing.StandardScaler(),\n... linear_model.LogisticRegression()\n... )\n\n>>> metric = metrics.Accuracy()\n\n>>> for x, y in dataset:\n... y_pred = model.predict_one(x) # make a prediction\n... metric.update(y, y_pred) # update the metric\n... model.learn_one(x, y) # make the model learn\n\n>>> metric\nAccuracy: 89.28%\n\n```\n\nOf course, this is just a contrived example. We welcome you to check the [introduction](https://riverml.xyz/dev/introduction/installation/) section of the documentation for a more thorough tutorial.\n\n## 🛠 Installation\n\nRiver is intended to work with **Python 3.8 and above**. Installation can be done with `pip`:\n\n```sh\npip install river\n```\n\nThere are [wheels available](https://pypi.org/project/river/#files) for Linux, MacOS, and Windows. This means you most probably won\'t have to build River from source.\n\nYou can install the latest development version from GitHub as so:\n\n```sh\npip install git+https://github.com/online-ml/river --upgrade\npip install git+ssh://git@github.com/online-ml/river.git --upgrade # using SSH\n```\n\nThis method requires having Cython and Rust installed on your machine.\n\n## 🔮 Features\n\nRiver provides online implementations of the following family of algorithms:\n\n- Linear models, with a wide array of optimizers\n- Decision trees and random forests\n- (Approximate) nearest neighbors\n- Anomaly detection\n- Drift detection\n- Recommender systems\n- Time series forecasting\n- Bandits\n- Factorization machines\n- Imbalanced learning\n- Clustering\n- Bagging/boosting/stacking\n- Active learning\n\nRiver also provides other online utilities:\n\n- Feature extraction and selection\n- Online statistics and metrics\n- Preprocessing\n- Built-in datasets\n- Progressive model validation\n- Model pipelines\n\nCheck out [the API](https://riverml.xyz/latest/api/overview/) for a comprehensive overview\n\n## 🤔 Should I be using River?\n\nYou should ask yourself if you need online machine learning. The answer is likely no. Most of the time batch learning does the job just fine. An online approach might fit the bill if:\n\n- You want a model that can learn from new data without having to revisit past data.\n- You want a model which is robust to [concept drift](https://www.wikiwand.com/en/Concept_drift).\n- You want to develop your model in a way that is closer to what occurs in a production context, which is usually event-based.\n\nSome specificities of River are that:\n\n- It focuses on clarity and user experience, more so than performance.\n- It\'s very fast at processing one sample at a time. Try it, you\'ll see.\n- It plays nicely with the rest of Python\'s ecosystem.\n\n## 🔗 Useful links\n\n- [Documentation](https://riverml.xyz)\n- [Package releases](https://pypi.org/project/river/#history)\n- [awesome-online-machine-learning](https://github.com/online-ml/awesome-online-machine-learning)\n- [2022 presentation at GAIA](https://www.youtube.com/watch?v=nzFTmJnIakk&list=PLIU25-FciwNaz5PqWPiHmPCMOFYoEsJ8c&index=5)\n- [Online Clustering: Algorithms, Evaluation, Metrics, Applications and Benchmarking](https://dl.acm.org/doi/10.1145/3534678.3542600) from [KDD\'22](https://kdd.org/kdd2022/).\n\n## 👐 Contributing\n\nFeel free to contribute in any way you like, we\'re always open to new ideas and approaches.\n\n- [Open a discussion](https://github.com/online-ml/river/discussions/new) if you have any question or enquiry whatsoever. It\'s more useful to ask your question in public rather than sending us a private email. It\'s also encouraged to open a discussion before contributing, so that everyone is aligned and unnecessary work is avoided.\n- Feel welcome to [open an issue](https://github.com/online-ml/river/issues/new/choose) if you think you\'ve spotted a bug or a performance issue.\n- Our [roadmap](https://github.com/orgs/online-ml/projects/3?query=is%3Aopen+sort%3Aupdated-desc) is public. Feel free to work on anything that catches your eye, or to make suggestions.\n\nPlease check out the [contribution guidelines](https://github.com/online-ml/river/blob/main/CONTRIBUTING.md) if you want to bring modifications to the code base.\n\n## 🤝 Affiliations\n\n

\n affiliations\n

\n\n## 💬 Citation\n\nIf River has been useful to you, and you would like to cite it in a scientific publication, please refer to the [paper](https://www.jmlr.org/papers/volume22/20-1380/20-1380.pdf) published at JMLR:\n\n```bibtex\n@article{montiel2021river,\n title={River: machine learning for streaming data in Python},\n author={Montiel, Jacob and Halford, Max and Mastelini, Saulo Martiello\n and Bolmier, Geoffrey and Sourty, Raphael and Vaysse, Robin and Zouitine, Adil\n and Gomes, Heitor Murilo and Read, Jesse and Abdessalem, Talel and others},\n year={2021}\n}\n```\n\n## 📝 License\n\nRiver is free and open-source software licensed under the [3-clause BSD license](https://github.com/online-ml/river/blob/main/LICENSE).\n', + 'author': 'Max Halford', + 'author_email': 'maxhalford25@gmail.com', + 'maintainer': 'None', + 'maintainer_email': 'None', + 'url': 'https://riverml.xyz/', + 'packages': packages, + 'package_data': package_data, + 'install_requires': install_requires, + 'python_requires': '>=3.9,<4.0', +} +from build import * +build(setup_kwargs) + +setup(**setup_kwargs) From 411eb98ee665a7bd9dfc8942030e21705736feb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarwane?= <“marwane.el-hani@imt-atlantique.net”> Date: Wed, 20 Nov 2024 09:00:26 +0100 Subject: [PATCH 06/24] Modification precision des performances --- river/model_selection/__init__.py | 4 ++ river/model_selection/hoeffding_races.py | 36 +++++++++++------- river/model_selection/hoeffding_races_test.py | 38 +++++++++---------- 3 files changed, 44 insertions(+), 34 deletions(-) diff --git a/river/model_selection/__init__.py b/river/model_selection/__init__.py index f010dcf5c6..9bd628c002 100644 --- a/river/model_selection/__init__.py +++ b/river/model_selection/__init__.py @@ -19,6 +19,9 @@ from .bandit import BanditClassifier, BanditRegressor from .greedy import GreedyRegressor from .sh import SuccessiveHalvingClassifier, SuccessiveHalvingRegressor +from .hoeffding_races import HoeffdingRace + + __all__ = [ "base", @@ -27,4 +30,5 @@ "GreedyRegressor", "SuccessiveHalvingClassifier", "SuccessiveHalvingRegressor", + "HoeffdingRace", ] diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index 871e9b4711..c4ac6d6336 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -1,17 +1,19 @@ import math from river import metrics, base +from river.base import Classifier -class HoeffdingRace(base.Estimator): + +class HoeffdingRace(Classifier): def __init__(self, models, delta=0.05, metric=metrics.Accuracy()): # Initialisation des modèles, du delta, de la métrique par défaut et des variables de suivi self.models = models self.delta = delta self.metric = metric # Métrique de base (peut être utilisée pour chaque modèle) - self.n_obs = 0 # Compteur - # Initialiser des métriques distinctes pour chaque modèle + self.n = 0 self.model_metrics = {name: metric.clone() for name in models.keys()} self.model_performance = {name: 0 for name in models.keys()} # Suivi des performances des modèles + self.remaining_models = set(models.keys()) # Liste des modèles restant @@ -19,25 +21,23 @@ def hoeffding_bound(self, n): return math.sqrt((math.log(1 / self.delta)) / (2 * n)) def learn_one(self, x, y): - self.n_obs += 1 - best_perf = max(self.model_performance.values()) / self.n_obs if self.n_obs > 0 else 0 + + best_perf = max(self.model_performance.values()) if self.n > 0 else 0 + self.n = self.n+1 for name in list(self.remaining_models): y_pred = self.models[name].predict_one(x) - self.models[name].learn_one(x, y) - + # Update performance result = self.model_metrics[name].update(y, y_pred) - self.model_performance[name] += self.model_metrics[name].get() / self.n_obs - - # Elimination check - avg_perf = self.model_performance[name] + self.model_performance[name] = self.model_metrics[name].get() + - if avg_perf + self.hoeffding_bound(self.n_obs) < best_perf: + if self.model_performance[name] + self.hoeffding_bound(self.n) < best_perf: self.remaining_models.remove(name) - print(f"{name} éliminé après {self.n_obs} observations") + print(f"{name} éliminé après {self.n} observations") @@ -54,3 +54,13 @@ def predict_one(self, x): return self.models[list(self.remaining_models)[0]].predict_one(x) return None # Pas de prédiction tant qu'un modèle n'est pas sélectionné + def _unit_test_params(self): + """Provides default parameters for unit testing.""" + from river.linear_model import LogisticRegression + from river.metrics import Accuracy + return { + "models": {"LogisticRegression": LogisticRegression()}, + "metric": Accuracy(), + "delta": 0.05 + } + diff --git a/river/model_selection/hoeffding_races_test.py b/river/model_selection/hoeffding_races_test.py index 8072f2a52a..005bdf9f88 100644 --- a/river/model_selection/hoeffding_races_test.py +++ b/river/model_selection/hoeffding_races_test.py @@ -1,28 +1,24 @@ -from river import linear_model, neighbors, tree -from river.datasets import synth -from river import datasets -from river import metrics -from hoeffding_races import HoeffdingRace -metric = metrics.Accuracy() +from hoeffding_races import HoeffdingRace +from river.linear_model import LogisticRegression +from river.metrics import Accuracy +from river import linear_model, neighbors, tree, metrics, datasets +from river import naive_bayes -# Modèles candidats -models = { - 'Regression': linear_model.LogisticRegression(), - 'KNN': neighbors.KNNClassifier(), - 'DecisionTree': tree.HoeffdingTreeClassifier() -} -# Initialisation de HoeffdingRace -hoeffding_race = HoeffdingRace(models=models,metric=metric) +# Instantiate a HoeffdingRace object with a single candidate model +hoeffding_race = HoeffdingRace( + models = {"LogisticRegression": linear_model.LogisticRegression()}, + metric=Accuracy(), + delta=0.05 +) -# Exécution sur un flux de données -dataset = datasets.CreditCard() -n=0 -for x,y in dataset: - selected_model = hoeffding_race.learn_one(x, y) - if selected_model: +dataset = datasets.AirlinePassengers() +print(dataset) +for x, y in dataset: + hoeffding_race.learn_one(x, y) + #print(hoeffding_race.model_metrics["KNN"].get()) + if len(hoeffding_race.remaining_models) == 1: break -print(n) print(hoeffding_race.model_performance) \ No newline at end of file From b4b0b0ed08a81a752b8751ff5efa9a80c54a3600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarwane?= <“marwane.el-hani@imt-atlantique.net”> Date: Wed, 20 Nov 2024 09:55:16 +0100 Subject: [PATCH 07/24] Fixed bugs + remove hoeffding_races_test.py --- river/model_selection/hoeffding_races.py | 60 +++++++++++-------- river/model_selection/hoeffding_races_test.py | 24 -------- 2 files changed, 34 insertions(+), 50 deletions(-) delete mode 100644 river/model_selection/hoeffding_races_test.py diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index c4ac6d6336..a87f2bfef9 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -5,16 +5,39 @@ class HoeffdingRace(Classifier): + """ + + >>> from river import model_selection + >>> from river.linear_model import LogisticRegression + >>> from river.metrics import Accuracy + >>> from river import linear_model, neighbors, tree, metrics, datasets + >>> from river import naive_bayes + >>> hoeffding_race = model_selection.HoeffdingRace( + ... models = { + ... "KNN": neighbors.KNNClassifier(), + ... "DecisionTree":tree.HoeffdingAdaptiveTreeRegressor()}, + ... metric=Accuracy(), + ... delta=0.05 + ... ) + >>> dataset = datasets.Bananas() + >>> for x, y in dataset: + ... hoeffding_race.learn_one(x, y) + ... if hoeffding_race.single_model_remaining(): + ... break + ... + >>> hoeffding_race.remaining_models + {'KNN'} + + """ def __init__(self, models, delta=0.05, metric=metrics.Accuracy()): - # Initialisation des modèles, du delta, de la métrique par défaut et des variables de suivi + self.models = models self.delta = delta - self.metric = metric # Métrique de base (peut être utilisée pour chaque modèle) + self.metric = metric self.n = 0 self.model_metrics = {name: metric.clone() for name in models.keys()} - self.model_performance = {name: 0 for name in models.keys()} # Suivi des performances des modèles - - self.remaining_models = set(models.keys()) # Liste des modèles restant + self.model_performance = {name: 0 for name in models.keys()} + self.remaining_models = set(models.keys()) def hoeffding_bound(self, n): @@ -26,26 +49,17 @@ def learn_one(self, x, y): self.n = self.n+1 for name in list(self.remaining_models): + y_pred = self.models[name].predict_one(x) self.models[name].learn_one(x, y) # Update performance - result = self.model_metrics[name].update(y, y_pred) - + self.model_metrics[name].update(y, y_pred) self.model_performance[name] = self.model_metrics[name].get() - if self.model_performance[name] + self.hoeffding_bound(self.n) < best_perf: self.remaining_models.remove(name) - print(f"{name} éliminé après {self.n} observations") - - - - # Early stopping if only one model remains - if len(self.remaining_models) == 1: - print(f"{list(self.remaining_models)[0]} est le modèle sélectionné.") - return list(self.remaining_models)[0] - + def predict_one(self, x): @@ -54,13 +68,7 @@ def predict_one(self, x): return self.models[list(self.remaining_models)[0]].predict_one(x) return None # Pas de prédiction tant qu'un modèle n'est pas sélectionné - def _unit_test_params(self): - """Provides default parameters for unit testing.""" - from river.linear_model import LogisticRegression - from river.metrics import Accuracy - return { - "models": {"LogisticRegression": LogisticRegression()}, - "metric": Accuracy(), - "delta": 0.05 - } + def single_model_remaining(self): + return len(self.remaining_models) == 1 + diff --git a/river/model_selection/hoeffding_races_test.py b/river/model_selection/hoeffding_races_test.py deleted file mode 100644 index 005bdf9f88..0000000000 --- a/river/model_selection/hoeffding_races_test.py +++ /dev/null @@ -1,24 +0,0 @@ - -from hoeffding_races import HoeffdingRace -from river.linear_model import LogisticRegression -from river.metrics import Accuracy -from river import linear_model, neighbors, tree, metrics, datasets -from river import naive_bayes - - - -# Instantiate a HoeffdingRace object with a single candidate model -hoeffding_race = HoeffdingRace( - models = {"LogisticRegression": linear_model.LogisticRegression()}, - metric=Accuracy(), - delta=0.05 -) - -dataset = datasets.AirlinePassengers() -print(dataset) -for x, y in dataset: - hoeffding_race.learn_one(x, y) - #print(hoeffding_race.model_metrics["KNN"].get()) - if len(hoeffding_race.remaining_models) == 1: - break -print(hoeffding_race.model_performance) \ No newline at end of file From 66ac5fb85fa987a0dcddeb2ad139f20933f2ff77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarwane?= <“marwane.el-hani@imt-atlantique.net”> Date: Wed, 20 Nov 2024 09:57:44 +0100 Subject: [PATCH 08/24] removed te.py file --- river/active/te.py | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 river/active/te.py diff --git a/river/active/te.py b/river/active/te.py deleted file mode 100644 index 583e983a4e..0000000000 --- a/river/active/te.py +++ /dev/null @@ -1,5 +0,0 @@ -from river import linear_model - -model = linear_model.LinearRegression() - -print(model) \ No newline at end of file From 89d42c800e8683094ba5374b8d13d9a9bc2ed2b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarwane?= <“marwane.el-hani@imt-atlantique.net”> Date: Wed, 20 Nov 2024 10:01:44 +0100 Subject: [PATCH 09/24] Fixed imports in hoeffding_races.py --- river/model_selection/hoeffding_races.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index a87f2bfef9..416d524465 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -1,10 +1,10 @@ import math from river import metrics, base -from river.base import Classifier -class HoeffdingRace(Classifier): + +class HoeffdingRace(base.Classifier): """ >>> from river import model_selection From 95eb204076e933d68ac8b35bb27563aac462371d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarwane?= <“marwane.el-hani@imt-atlantique.net”> Date: Wed, 20 Nov 2024 10:08:04 +0100 Subject: [PATCH 10/24] Added default value for models --- river/model_selection/hoeffding_races.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index 416d524465..c2b9b55192 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -1,5 +1,5 @@ import math -from river import metrics, base +from river import metrics, base, neighbors @@ -7,6 +7,7 @@ class HoeffdingRace(base.Classifier): """ + >>> from river import model_selection >>> from river.linear_model import LogisticRegression >>> from river.metrics import Accuracy @@ -29,7 +30,7 @@ class HoeffdingRace(base.Classifier): {'KNN'} """ - def __init__(self, models, delta=0.05, metric=metrics.Accuracy()): + def __init__(self, models={"KNN":neighbors.KNNClassifier()}, delta=0.05, metric=metrics.Accuracy()): self.models = models self.delta = delta From 1d880f9cae90dbd7fc3a5505b025a52510b6e775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarwane?= <“marwane.el-hani@imt-atlantique.net”> Date: Wed, 20 Nov 2024 10:40:12 +0100 Subject: [PATCH 11/24] Fixed tests --- river/model_selection/hoeffding_races.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index c2b9b55192..6721c2df03 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -9,18 +9,16 @@ class HoeffdingRace(base.Classifier): >>> from river import model_selection - >>> from river.linear_model import LogisticRegression - >>> from river.metrics import Accuracy >>> from river import linear_model, neighbors, tree, metrics, datasets - >>> from river import naive_bayes + >>> hoeffding_race = model_selection.HoeffdingRace( ... models = { ... "KNN": neighbors.KNNClassifier(), - ... "DecisionTree":tree.HoeffdingAdaptiveTreeRegressor()}, - ... metric=Accuracy(), + ... "Log_Reg":linear_model.LogisticRegression()}, + ... metric=metrics.Accuracy(), ... delta=0.05 ... ) - >>> dataset = datasets.Bananas() + >>> dataset = datasets.Phishing() >>> for x, y in dataset: ... hoeffding_race.learn_one(x, y) ... if hoeffding_race.single_model_remaining(): From 27d8d02748af0a0ebc3180de0f0a4cb0b8d0a47e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarwane?= <“marwane.el-hani@imt-atlantique.net”> Date: Wed, 20 Nov 2024 11:00:01 +0100 Subject: [PATCH 12/24] Separating classifier and regression selection --- river/model_selection/__init__.py | 4 ++-- river/model_selection/hoeffding_races.py | 12 ++++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/river/model_selection/__init__.py b/river/model_selection/__init__.py index 9bd628c002..f0d16b81a8 100644 --- a/river/model_selection/__init__.py +++ b/river/model_selection/__init__.py @@ -19,7 +19,7 @@ from .bandit import BanditClassifier, BanditRegressor from .greedy import GreedyRegressor from .sh import SuccessiveHalvingClassifier, SuccessiveHalvingRegressor -from .hoeffding_races import HoeffdingRace +from .hoeffding_races import HoeffdingRace_classifier @@ -30,5 +30,5 @@ "GreedyRegressor", "SuccessiveHalvingClassifier", "SuccessiveHalvingRegressor", - "HoeffdingRace", + "HoeffdingRace_Classifier", ] diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index 6721c2df03..87cdd6fc28 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -1,17 +1,20 @@ import math -from river import metrics, base, neighbors +from river import metrics, base, neighbors,linear_model -class HoeffdingRace(base.Classifier): +class HoeffdingRace_classifier(base.Classifier): """ + + + Tests on Binary Classification >>> from river import model_selection >>> from river import linear_model, neighbors, tree, metrics, datasets - >>> hoeffding_race = model_selection.HoeffdingRace( + >>> hoeffding_race = model_selection.HoeffdingRace_classifier( ... models = { ... "KNN": neighbors.KNNClassifier(), ... "Log_Reg":linear_model.LogisticRegression()}, @@ -28,7 +31,8 @@ class HoeffdingRace(base.Classifier): {'KNN'} """ - def __init__(self, models={"KNN":neighbors.KNNClassifier()}, delta=0.05, metric=metrics.Accuracy()): + def __init__(self, models={"KNN":neighbors.KNNClassifier(), + "Log_Reg":linear_model.LogisticRegression()}, delta=0.05, metric=metrics.Accuracy()): self.models = models self.delta = delta From b69b62edd3c9ef5df8a6f767c356a8d01f3c99e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarwane?= <“marwane.el-hani@imt-atlantique.net”> Date: Wed, 20 Nov 2024 11:07:56 +0100 Subject: [PATCH 13/24] Ajout classe Regressor --- river/model_selection/hoeffding_races.py | 52 +++++++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index 87cdd6fc28..a6fd84429f 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -6,8 +6,6 @@ class HoeffdingRace_classifier(base.Classifier): """ - - Tests on Binary Classification @@ -73,5 +71,55 @@ def predict_one(self, x): def single_model_remaining(self): return len(self.remaining_models) == 1 + + + +class HoeffdingRace_regressor(base.Regressor): + """ + + """ + def __init__(self, models={"KNN":neighbors.KNNRegressor(), + "Log_Reg":linear_model.LinearRegression()}, delta=0.05, metric=metrics.MAE()): + + self.models = models + self.delta = delta + self.metric = metric + self.n = 0 + self.model_metrics = {name: metric.clone() for name in models.keys()} + self.model_performance = {name: 0 for name in models.keys()} + self.remaining_models = set(models.keys()) + + + def hoeffding_bound(self, n): + return math.sqrt((math.log(1 / self.delta)) / (2 * n)) + + def learn_one(self, x, y): + + best_perf = max(self.model_performance.values()) if self.n > 0 else 0 + self.n = self.n+1 + + for name in list(self.remaining_models): + + y_pred = self.models[name].predict_one(x) + self.models[name].learn_one(x, y) + + # Update performance + self.model_metrics[name].update(y, y_pred) + self.model_performance[name] = self.model_metrics[name].get() + + if self.model_performance[name] + self.hoeffding_bound(self.n) < best_perf: + self.remaining_models.remove(name) + + + + def predict_one(self, x): + # Prediction by best remaining model + if len(self.remaining_models) == 1: + return self.models[list(self.remaining_models)[0]].predict_one(x) + return None # Pas de prédiction tant qu'un modèle n'est pas sélectionné + + def single_model_remaining(self): + return len(self.remaining_models) == 1 + From d2946e7371ba9c6b3abd75b0b9bccd4ff8224391 Mon Sep 17 00:00:00 2001 From: slach31 <158080080+slach31@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:15:22 +0100 Subject: [PATCH 14/24] Delete river/base/Adpredictor_test.ipynb --- river/base/Adpredictor_test.ipynb | 188 ------------------------------ 1 file changed, 188 deletions(-) delete mode 100644 river/base/Adpredictor_test.ipynb diff --git a/river/base/Adpredictor_test.ipynb b/river/base/Adpredictor_test.ipynb deleted file mode 100644 index 5a6b4b4ab9..0000000000 --- a/river/base/Adpredictor_test.ipynb +++ /dev/null @@ -1,188 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 9, - "id": "initial_id", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:34.707796Z", - "start_time": "2024-11-09T15:04:34.663651Z" - }, - "collapsed": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "source": [ - "from __future__ import annotations\n", - "\n", - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "65e3111b76277fc5", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:37.139365Z", - "start_time": "2024-11-09T15:04:37.097733Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "C:\\River\\riverIDLIB\\river\\base\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "print(os.getcwd())" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "3ffeadeef731f48e", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:38.751355Z", - "start_time": "2024-11-09T15:04:38.708257Z" - } - }, - "outputs": [], - "source": [ - "from river import compose, datasets, metrics, preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "e323aa048e864b33", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:45.986384Z", - "start_time": "2024-11-09T15:04:45.943819Z" - } - }, - "outputs": [], - "source": [ - "import importlib.util\n", - "\n", - "spec = importlib.util.spec_from_file_location(\"AdPredictor\", \"./AdPredictor.py\")\n", - "AdPredictor_module = importlib.util.module_from_spec(spec)\n", - "spec.loader.exec_module(AdPredictor_module)\n", - "\n", - "AdPredictor = AdPredictor_module.AdPredictor" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "c862e33d656cb230", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:49.810249Z", - "start_time": "2024-11-09T15:04:49.768766Z" - } - }, - "outputs": [], - "source": [ - "phishing_data = datasets.Phishing()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "293c681cca67e1f4", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:54.402788Z", - "start_time": "2024-11-09T15:04:54.358522Z" - } - }, - "outputs": [], - "source": [ - "model = AdPredictor(beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=9)\n", - "metric = metrics.Accuracy()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "c8fb1bb8ed902d80", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:04:59.609037Z", - "start_time": "2024-11-09T15:04:59.569533Z" - } - }, - "outputs": [], - "source": [ - "model_pipeline = compose.Pipeline((\"scale\", preprocessing.StandardScaler()), (\"predictor\", model))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "1dea7c542ab4ad84", - "metadata": { - "ExecuteTime": { - "end_time": "2024-11-09T15:03:47.107046Z", - "start_time": "2024-11-09T15:03:47.054657Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Prediction: 0.5, Metric: None\n" - ] - } - ], - "source": [ - "for x, y in phishing_data:\n", - " y_pred = model_pipeline.predict_one(x)\n", - " metric = metric.update(y, y_pred)\n", - " model_pipeline = model_pipeline.learn_one(x, y)\n", - " print(f\"Prediction: {y_pred}, Metric: {metric}\")\n", - " break" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From a32f48b564126b12a768afa64e841ee4939f4719 Mon Sep 17 00:00:00 2001 From: slach31 <158080080+slach31@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:15:41 +0100 Subject: [PATCH 15/24] Delete river/base/Adpredictor.py --- river/base/Adpredictor.py | 76 --------------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 river/base/Adpredictor.py diff --git a/river/base/Adpredictor.py b/river/base/Adpredictor.py deleted file mode 100644 index 4702760cf8..0000000000 --- a/river/base/Adpredictor.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import annotations - -import logging -from collections import defaultdict, namedtuple - -import numpy as np - -from river import base - -logger = logging.getLogger(__name__) - - -class AdPredictor(base.Classifier): - config = namedtuple("config", ["beta", "prior_probability", "epsilon", "num_features"]) - - def __init__(self, beta=0.1, prior_probability=0.5, epsilon=0.1, num_features=10): - self.beta = beta - self.prior_probability = prior_probability - self.epsilon = epsilon - self.num_features = num_features - self.weights = defaultdict(lambda: {"mean": 0.0, "variance": 1.0}) - self.bias_weight = self.prior_bias_weight() - - def prior_bias_weight(self): - return np.log(self.prior_probability / (1 - self.prior_probability)) / self.beta - - def _active_mean_variance(self, features): - total_mean = sum(self.weights[f]["mean"] for f in features) + self.bias_weight - total_variance = sum(self.weights[f]["variance"] for f in features) + self.beta**2 - return total_mean, total_variance - - def predict_one(self, x): - features = x.keys() - total_mean, total_variance = self._active_mean_variance(features) - return 1 / (1 + np.exp(-total_mean / np.sqrt(total_variance))) - - def learn_one(self, x, y): - features = x.keys() - y = 1 if y else -1 # Map label to ±1 for binary classification - total_mean, total_variance = self._active_mean_variance(features) - v, w = self.gaussian_corrections(y * total_mean / np.sqrt(total_variance)) - - for feature in features: - mean = self.weights[feature]["mean"] - variance = self.weights[feature]["variance"] - - mean_delta = y * variance / np.sqrt(total_variance) * v - variance_multiplier = 1.0 - variance / total_variance * w - - # Update weight - self.weights[feature]["mean"] = mean + mean_delta - self.weights[feature]["variance"] = variance * variance_multiplier - - def gaussian_corrections(self, score): - """Compute Gaussian corrections for Bayesian update.""" - cdf = 1 / (1 + np.exp(-score)) - pdf = np.exp(-0.5 * score**2) / np.sqrt(2 * np.pi) - v = pdf / cdf - w = v * (v + score) - return v, w - - def _apply_dynamics(self, weight): - prior_variance = 1.0 - adjusted_variance = ( - weight["variance"] - * prior_variance - / ((1.0 - self.epsilon) * prior_variance + self.epsilon * weight["variance"]) - ) - adjusted_mean = adjusted_variance * ( - (1.0 - self.epsilon) * weight["mean"] / weight["variance"] - + self.epsilon * 0 / prior_variance - ) - return {"mean": adjusted_mean, "variance": adjusted_variance} - - def __str__(self): - return "AdPredictor" From 0e7d9d8d3ffcec273e856abe8a23df1243cc62fc Mon Sep 17 00:00:00 2001 From: slach31 <158080080+slach31@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:16:45 +0100 Subject: [PATCH 16/24] Delete river/linear_model/rls.py --- river/linear_model/rls.py | 47 --------------------------------------- 1 file changed, 47 deletions(-) delete mode 100644 river/linear_model/rls.py diff --git a/river/linear_model/rls.py b/river/linear_model/rls.py deleted file mode 100644 index 9aa69410e1..0000000000 --- a/river/linear_model/rls.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np - - -class RLS(object): - - def __init__(self, p: int, l: float, delta: float): - self.p = p # Filter order - self.l = l # Forgetting factor - self.delta = delta # Value to initialise P(0) - - self.currentStep = 0 - - self.x = np.zeros((p + 1, 1)) # Column vector - self.P = np.identity(p + 1) * self.delta - - self.estimates = [] - self.estimates.append(np.zeros((p + 1, 1))) # Weight vector initialized to zeros - - self.Pks = [] - self.Pks.append(self.P) - - def estimate(self, xn: float, dn: float): - # Update input vector - self.x = np.roll(self.x, -1) - self.x[-1, 0] = xn - - # Get previous weight vector - wn_prev = self.estimates[-1] - - # Compute gain vector - denominator = self.l + self.x.T @ self.Pks[-1] @ self.x - gn = (self.Pks[-1] @ self.x) / denominator - - # Compute a priori error - alpha = dn - (self.x.T @ wn_prev) - - # Update inverse correlation matrix - Pn = (self.Pks[-1] - gn @ self.x.T @ self.Pks[-1]) / self.l - self.Pks.append(Pn) - - # Update weight vector - wn = wn_prev + gn * alpha - self.estimates.append(wn) - - self.currentStep += 1 - - return wn From d6695ac99294b287ff3c2ab57720196b41b3a10f Mon Sep 17 00:00:00 2001 From: slach31 <158080080+slach31@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:17:03 +0100 Subject: [PATCH 17/24] Update __init__.py --- river/linear_model/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/river/linear_model/__init__.py b/river/linear_model/__init__.py index e74d0d1439..756720490a 100644 --- a/river/linear_model/__init__.py +++ b/river/linear_model/__init__.py @@ -10,7 +10,6 @@ from .pa import PAClassifier, PARegressor from .perceptron import Perceptron from .softmax import SoftmaxRegression -from .rls import RLS __all__ = [ "base", @@ -22,5 +21,4 @@ "PARegressor", "Perceptron", "SoftmaxRegression", - "RLS", ] From 3dbcdd0c93e27c3e6e03d7a71e231e02568205c6 Mon Sep 17 00:00:00 2001 From: slach31 <158080080+slach31@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:18:08 +0100 Subject: [PATCH 18/24] Update hoeffding_races.py --- river/model_selection/hoeffding_races.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index a6fd84429f..dc659430b1 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -4,7 +4,7 @@ -class HoeffdingRace_classifier(base.Classifier): +class HoeffdingRaceClassifier(base.Classifier): """ Tests on Binary Classification @@ -12,7 +12,7 @@ class HoeffdingRace_classifier(base.Classifier): >>> from river import model_selection >>> from river import linear_model, neighbors, tree, metrics, datasets - >>> hoeffding_race = model_selection.HoeffdingRace_classifier( + >>> hoeffding_race = model_selection.HoeffdingRaceClassifier( ... models = { ... "KNN": neighbors.KNNClassifier(), ... "Log_Reg":linear_model.LogisticRegression()}, @@ -74,7 +74,7 @@ def single_model_remaining(self): -class HoeffdingRace_regressor(base.Regressor): +class HoeffdingRaceRegressor(base.Regressor): """ From 39ebc9f95ccc39711520477be9f65373856cbe9f Mon Sep 17 00:00:00 2001 From: slach31 <158080080+slach31@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:18:55 +0100 Subject: [PATCH 19/24] Update __init__.py --- river/model_selection/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/river/model_selection/__init__.py b/river/model_selection/__init__.py index f0d16b81a8..0ce9956a7d 100644 --- a/river/model_selection/__init__.py +++ b/river/model_selection/__init__.py @@ -19,7 +19,7 @@ from .bandit import BanditClassifier, BanditRegressor from .greedy import GreedyRegressor from .sh import SuccessiveHalvingClassifier, SuccessiveHalvingRegressor -from .hoeffding_races import HoeffdingRace_classifier +from .hoeffding_races import HoeffdingRaceClassifier @@ -30,5 +30,5 @@ "GreedyRegressor", "SuccessiveHalvingClassifier", "SuccessiveHalvingRegressor", - "HoeffdingRace_Classifier", + "HoeffdingRaceClassifier", ] From ecbaa3520d12600052d3bb13a251a988c716aae4 Mon Sep 17 00:00:00 2001 From: W0lfgunbl00d Date: Wed, 20 Nov 2024 11:21:32 +0100 Subject: [PATCH 20/24] test precommit --- river/model_selection/__init__.py | 4 +- river/model_selection/hoeffding_races.py | 77 +++++++++++------------- 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/river/model_selection/__init__.py b/river/model_selection/__init__.py index 0ce9956a7d..9413852b48 100644 --- a/river/model_selection/__init__.py +++ b/river/model_selection/__init__.py @@ -18,10 +18,8 @@ from . import base from .bandit import BanditClassifier, BanditRegressor from .greedy import GreedyRegressor -from .sh import SuccessiveHalvingClassifier, SuccessiveHalvingRegressor from .hoeffding_races import HoeffdingRaceClassifier - - +from .sh import SuccessiveHalvingClassifier, SuccessiveHalvingRegressor __all__ = [ "base", diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index dc659430b1..5b37e28b25 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -1,12 +1,12 @@ -import math -from river import metrics, base, neighbors,linear_model +from __future__ import annotations +import math +from river import base, linear_model, metrics, neighbors class HoeffdingRaceClassifier(base.Classifier): """ - Tests on Binary Classification >>> from river import model_selection @@ -27,99 +27,90 @@ class HoeffdingRaceClassifier(base.Classifier): ... >>> hoeffding_race.remaining_models {'KNN'} - """ - def __init__(self, models={"KNN":neighbors.KNNClassifier(), - "Log_Reg":linear_model.LogisticRegression()}, delta=0.05, metric=metrics.Accuracy()): - + + def __init__( + self, + models={"KNN": neighbors.KNNClassifier(), "Log_Reg": linear_model.LogisticRegression()}, + delta=0.05, + metric=metrics.Accuracy(), + ): self.models = models self.delta = delta - self.metric = metric + self.metric = metric self.n = 0 self.model_metrics = {name: metric.clone() for name in models.keys()} - self.model_performance = {name: 0 for name in models.keys()} - self.remaining_models = set(models.keys()) - + self.model_performance = {name: 0 for name in models.keys()} + self.remaining_models = set(models.keys()) def hoeffding_bound(self, n): return math.sqrt((math.log(1 / self.delta)) / (2 * n)) def learn_one(self, x, y): - best_perf = max(self.model_performance.values()) if self.n > 0 else 0 - self.n = self.n+1 + self.n = self.n + 1 for name in list(self.remaining_models): - y_pred = self.models[name].predict_one(x) self.models[name].learn_one(x, y) - + # Update performance self.model_metrics[name].update(y, y_pred) - self.model_performance[name] = self.model_metrics[name].get() + self.model_performance[name] = self.model_metrics[name].get() - if self.model_performance[name] + self.hoeffding_bound(self.n) < best_perf: + if self.model_performance[name] + self.hoeffding_bound(self.n) < best_perf: self.remaining_models.remove(name) - - def predict_one(self, x): # Prediction by best remaining model if len(self.remaining_models) == 1: return self.models[list(self.remaining_models)[0]].predict_one(x) return None # Pas de prédiction tant qu'un modèle n'est pas sélectionné - + def single_model_remaining(self): return len(self.remaining_models) == 1 - class HoeffdingRaceRegressor(base.Regressor): - """ - - - """ - def __init__(self, models={"KNN":neighbors.KNNRegressor(), - "Log_Reg":linear_model.LinearRegression()}, delta=0.05, metric=metrics.MAE()): - + """ """ + + def __init__( + self, + models={"KNN": neighbors.KNNRegressor(), "Log_Reg": linear_model.LinearRegression()}, + delta=0.05, + metric=metrics.MAE(), + ): self.models = models self.delta = delta - self.metric = metric + self.metric = metric self.n = 0 self.model_metrics = {name: metric.clone() for name in models.keys()} - self.model_performance = {name: 0 for name in models.keys()} - self.remaining_models = set(models.keys()) - + self.model_performance = {name: 0 for name in models.keys()} + self.remaining_models = set(models.keys()) def hoeffding_bound(self, n): return math.sqrt((math.log(1 / self.delta)) / (2 * n)) def learn_one(self, x, y): - best_perf = max(self.model_performance.values()) if self.n > 0 else 0 - self.n = self.n+1 + self.n = self.n + 1 for name in list(self.remaining_models): - y_pred = self.models[name].predict_one(x) self.models[name].learn_one(x, y) - + # Update performance self.model_metrics[name].update(y, y_pred) - self.model_performance[name] = self.model_metrics[name].get() + self.model_performance[name] = self.model_metrics[name].get() - if self.model_performance[name] + self.hoeffding_bound(self.n) < best_perf: + if self.model_performance[name] + self.hoeffding_bound(self.n) < best_perf: self.remaining_models.remove(name) - - def predict_one(self, x): # Prediction by best remaining model if len(self.remaining_models) == 1: return self.models[list(self.remaining_models)[0]].predict_one(x) return None # Pas de prédiction tant qu'un modèle n'est pas sélectionné - + def single_model_remaining(self): return len(self.remaining_models) == 1 - - From 432363d2e508dab48fab569d30450bfc8f940bfb Mon Sep 17 00:00:00 2001 From: EL HANI Marwane Date: Wed, 27 Nov 2024 15:59:03 +0100 Subject: [PATCH 21/24] Added commentary + fixed bugs --- river/model_selection/__init__.py | 2 + river/model_selection/hoeffding_races.py | 72 +++++++++++++++++++++--- 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/river/model_selection/__init__.py b/river/model_selection/__init__.py index 9413852b48..c25b6dd5db 100644 --- a/river/model_selection/__init__.py +++ b/river/model_selection/__init__.py @@ -19,6 +19,7 @@ from .bandit import BanditClassifier, BanditRegressor from .greedy import GreedyRegressor from .hoeffding_races import HoeffdingRaceClassifier +from .hoeffding_races import HoeffdingRaceRegressor from .sh import SuccessiveHalvingClassifier, SuccessiveHalvingRegressor __all__ = [ @@ -29,4 +30,5 @@ "SuccessiveHalvingClassifier", "SuccessiveHalvingRegressor", "HoeffdingRaceClassifier", + "HoeffdingRaceRegressor" ] diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index 5b37e28b25..f747e166d4 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -7,6 +7,21 @@ class HoeffdingRaceClassifier(base.Classifier): """ + HoeffdingRace-based model selection for Classification. + + Each models is associated to a performance (here its accuracy). When the model is considered too inaccurate by the hoeffding bound, + the model is removed. + + Parameters + ---------- + models + The models to select from. + metric + The metric that is used to measure the performance of each model. + delta + Hoeffding bound precision. + + Tests on Binary Classification >>> from river import model_selection @@ -41,9 +56,13 @@ def __init__( self.n = 0 self.model_metrics = {name: metric.clone() for name in models.keys()} self.model_performance = {name: 0 for name in models.keys()} - self.remaining_models = set(models.keys()) + self.remaining_models = [i for i in models.keys()] def hoeffding_bound(self, n): + """ + Computes the hoeffding bound according to n, the number of iterations done. + + """ return math.sqrt((math.log(1 / self.delta)) / (2 * n)) def learn_one(self, x, y): @@ -55,24 +74,58 @@ def learn_one(self, x, y): self.models[name].learn_one(x, y) # Update performance + self.model_metrics[name].update(y, y_pred) self.model_performance[name] = self.model_metrics[name].get() if self.model_performance[name] + self.hoeffding_bound(self.n) < best_perf: self.remaining_models.remove(name) + if len(self.remaining_models) == 1: + break def predict_one(self, x): - # Prediction by best remaining model + if len(self.remaining_models) == 1: return self.models[list(self.remaining_models)[0]].predict_one(x) - return None # Pas de prédiction tant qu'un modèle n'est pas sélectionné + return None def single_model_remaining(self): return len(self.remaining_models) == 1 class HoeffdingRaceRegressor(base.Regressor): - """ """ + """ + HoeffdingRace-based model selection for regression. + + Each models is associated to a performance (here its accuracy). When the model is considered too inaccurate by the hoeffding bound, + the model is removed. + + Parameters + ---------- + models + The models to select from. + metric + The metric that is used to measure the performance of each model. + delta + Hoeffding bound precision. + + Tests on Regression models + >>> from river import linear_model, neighbors, tree, metrics, datasets, model_selection + >>> hoeffding_race = model_selection.HoeffdingRaceRegressor( + ... models = {"KNN": neighbors.KNNRegressor(), + ... "Log_Reg":linear_model.LinearRegression()}, + ... metric=metrics.MAE(), + ... delta=0.05) + >>> dataset = datasets.ChickWeights() + >>> for x, y in dataset: + ... hoeffding_race.learn_one(x, y) + ... if hoeffding_race.single_model_remaining(): + ... break + ... +>>> print(hoeffding_race.remaining_models) +['Log_Reg'] + + """ def __init__( self, @@ -86,7 +139,7 @@ def __init__( self.n = 0 self.model_metrics = {name: metric.clone() for name in models.keys()} self.model_performance = {name: 0 for name in models.keys()} - self.remaining_models = set(models.keys()) + self.remaining_models = [i for i in models.keys()] def hoeffding_bound(self, n): return math.sqrt((math.log(1 / self.delta)) / (2 * n)) @@ -100,17 +153,22 @@ def learn_one(self, x, y): self.models[name].learn_one(x, y) # Update performance + self.model_metrics[name].update(y, y_pred) self.model_performance[name] = self.model_metrics[name].get() if self.model_performance[name] + self.hoeffding_bound(self.n) < best_perf: self.remaining_models.remove(name) + if len(self.remaining_models) == 1: + break def predict_one(self, x): - # Prediction by best remaining model if len(self.remaining_models) == 1: return self.models[list(self.remaining_models)[0]].predict_one(x) - return None # Pas de prédiction tant qu'un modèle n'est pas sélectionné + return None def single_model_remaining(self): + """ + Method to be able to know if the "race" has ended. + """ return len(self.remaining_models) == 1 From 1850eff7b032da6f1c4acce1879821a7e37b7b51 Mon Sep 17 00:00:00 2001 From: EL HANI Marwane Date: Wed, 27 Nov 2024 16:00:32 +0100 Subject: [PATCH 22/24] Fixed typo --- river/model_selection/hoeffding_races.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index f747e166d4..3f92fd44ad 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -41,7 +41,7 @@ class HoeffdingRaceClassifier(base.Classifier): ... break ... >>> hoeffding_race.remaining_models - {'KNN'} + ['KNN'] """ def __init__( @@ -122,8 +122,8 @@ class HoeffdingRaceRegressor(base.Regressor): ... if hoeffding_race.single_model_remaining(): ... break ... ->>> print(hoeffding_race.remaining_models) -['Log_Reg'] + >>> print(hoeffding_race.remaining_models) + ['Log_Reg'] """ From 286660b85609fcdf79fb2b5a2dd88501f8b4a75b Mon Sep 17 00:00:00 2001 From: EL HANI Marwane Date: Thu, 28 Nov 2024 16:23:10 +0100 Subject: [PATCH 23/24] Fixed issues pre-commit --- river/model_selection/__init__.py | 5 ++--- river/model_selection/hoeffding_races.py | 9 ++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/river/model_selection/__init__.py b/river/model_selection/__init__.py index c25b6dd5db..a1f9a9f3d7 100644 --- a/river/model_selection/__init__.py +++ b/river/model_selection/__init__.py @@ -18,8 +18,7 @@ from . import base from .bandit import BanditClassifier, BanditRegressor from .greedy import GreedyRegressor -from .hoeffding_races import HoeffdingRaceClassifier -from .hoeffding_races import HoeffdingRaceRegressor +from .hoeffding_races import HoeffdingRaceClassifier, HoeffdingRaceRegressor from .sh import SuccessiveHalvingClassifier, SuccessiveHalvingRegressor __all__ = [ @@ -30,5 +29,5 @@ "SuccessiveHalvingClassifier", "SuccessiveHalvingRegressor", "HoeffdingRaceClassifier", - "HoeffdingRaceRegressor" + "HoeffdingRaceRegressor", ] diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index 3f92fd44ad..25d2f3cd3c 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -84,17 +84,16 @@ def learn_one(self, x, y): break def predict_one(self, x): - if len(self.remaining_models) == 1: return self.models[list(self.remaining_models)[0]].predict_one(x) - return None + return None def single_model_remaining(self): return len(self.remaining_models) == 1 class HoeffdingRaceRegressor(base.Regressor): - """ + """ HoeffdingRace-based model selection for regression. Each models is associated to a performance (here its accuracy). When the model is considered too inaccurate by the hoeffding bound, @@ -124,7 +123,7 @@ class HoeffdingRaceRegressor(base.Regressor): ... >>> print(hoeffding_race.remaining_models) ['Log_Reg'] - + """ def __init__( @@ -165,7 +164,7 @@ def learn_one(self, x, y): def predict_one(self, x): if len(self.remaining_models) == 1: return self.models[list(self.remaining_models)[0]].predict_one(x) - return None + return None def single_model_remaining(self): """ From 6bc1833e8b3903b13c53cecdc1324c8e117bff5a Mon Sep 17 00:00:00 2001 From: s23lachg Date: Sat, 30 Nov 2024 21:06:41 +0100 Subject: [PATCH 24/24] fix errors pytest hoeffding_races --- river/model_selection/hoeffding_races.py | 32 +++++++++++++++--------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/river/model_selection/hoeffding_races.py b/river/model_selection/hoeffding_races.py index 25d2f3cd3c..721be4de18 100644 --- a/river/model_selection/hoeffding_races.py +++ b/river/model_selection/hoeffding_races.py @@ -44,12 +44,16 @@ class HoeffdingRaceClassifier(base.Classifier): ['KNN'] """ - def __init__( - self, - models={"KNN": neighbors.KNNClassifier(), "Log_Reg": linear_model.LogisticRegression()}, - delta=0.05, - metric=metrics.Accuracy(), - ): + def __init__(self, models=None, delta=0.05, metric=None): + if models is None: + models = { + "KNN": neighbors.KNNClassifier(), + "Log_Reg": linear_model.LogisticRegression(), + } + if metric is None: + metric = metrics.Accuracy() + + # Assign to instance variables self.models = models self.delta = delta self.metric = metric @@ -126,12 +130,16 @@ class HoeffdingRaceRegressor(base.Regressor): """ - def __init__( - self, - models={"KNN": neighbors.KNNRegressor(), "Log_Reg": linear_model.LinearRegression()}, - delta=0.05, - metric=metrics.MAE(), - ): + def __init__(self, models=None, delta=0.05, metric=None): + if models is None: + models = { + "KNN": neighbors.KNNRegressor(), + "Log_Reg": linear_model.LinearRegression(), + } + if metric is None: + metric = metrics.MAE() + + # Assign to instance variables self.models = models self.delta = delta self.metric = metric