diff --git a/dddex/__pycache__/__init__.cpython-38.pyc b/dddex/__pycache__/__init__.cpython-38.pyc index 40199ec..b59f17a 100644 Binary files a/dddex/__pycache__/__init__.cpython-38.pyc and b/dddex/__pycache__/__init__.cpython-38.pyc differ diff --git a/dddex/__pycache__/_modidx.cpython-38.pyc b/dddex/__pycache__/_modidx.cpython-38.pyc index 81c80db..dd9c6ad 100644 Binary files a/dddex/__pycache__/_modidx.cpython-38.pyc and b/dddex/__pycache__/_modidx.cpython-38.pyc differ diff --git a/dddex/__pycache__/baseClasses.cpython-38.pyc b/dddex/__pycache__/baseClasses.cpython-38.pyc index 3c20227..7cf24e6 100644 Binary files a/dddex/__pycache__/baseClasses.cpython-38.pyc and b/dddex/__pycache__/baseClasses.cpython-38.pyc differ diff --git a/dddex/__pycache__/crossValidation.cpython-38.pyc b/dddex/__pycache__/crossValidation.cpython-38.pyc index 1eeaf20..46e674e 100644 Binary files a/dddex/__pycache__/crossValidation.cpython-38.pyc and b/dddex/__pycache__/crossValidation.cpython-38.pyc differ diff --git a/dddex/__pycache__/levelSetKDEx_multivariate.cpython-38.pyc b/dddex/__pycache__/levelSetKDEx_multivariate.cpython-38.pyc index c38408f..59b1ba0 100644 Binary files a/dddex/__pycache__/levelSetKDEx_multivariate.cpython-38.pyc and b/dddex/__pycache__/levelSetKDEx_multivariate.cpython-38.pyc differ diff --git a/dddex/__pycache__/levelSetKDEx_univariate.cpython-38.pyc b/dddex/__pycache__/levelSetKDEx_univariate.cpython-38.pyc index 1ae72c3..7b35cb8 100644 Binary files a/dddex/__pycache__/levelSetKDEx_univariate.cpython-38.pyc and b/dddex/__pycache__/levelSetKDEx_univariate.cpython-38.pyc differ diff --git a/dddex/__pycache__/loadData.cpython-38.pyc b/dddex/__pycache__/loadData.cpython-38.pyc index b5993b4..bc62e33 100644 Binary files a/dddex/__pycache__/loadData.cpython-38.pyc and b/dddex/__pycache__/loadData.cpython-38.pyc differ diff --git a/dddex/__pycache__/utils.cpython-38.pyc b/dddex/__pycache__/utils.cpython-38.pyc index 8aa3116..087df71 100644 Binary files a/dddex/__pycache__/utils.cpython-38.pyc and b/dddex/__pycache__/utils.cpython-38.pyc differ diff --git a/dddex/__pycache__/wSAA.cpython-38.pyc b/dddex/__pycache__/wSAA.cpython-38.pyc index d79e581..6b2fc05 100644 Binary files a/dddex/__pycache__/wSAA.cpython-38.pyc and b/dddex/__pycache__/wSAA.cpython-38.pyc differ diff --git a/dddex/levelSetKDEx_univariate.py b/dddex/levelSetKDEx_univariate.py index 0085f93..d5902c6 100644 --- a/dddex/levelSetKDEx_univariate.py +++ b/dddex/levelSetKDEx_univariate.py @@ -408,7 +408,7 @@ def generateBins(binSize: int, # Size of the bins of values of `yPred` being gro for i in range(len(yPred)): if i == 0: - lowerBoundPerBin[binIndex] = np.NINF + lowerBoundPerBin[binIndex] = -np.inf currentBinSize += 1 trainIndicesLeft -= 1 diff --git a/nbs/.ipynb_checkpoints/01_levelSetKDEx_univariate-checkpoint.ipynb b/nbs/.ipynb_checkpoints/01_levelSetKDEx_univariate-checkpoint.ipynb index 4471577..1a504d3 100644 --- a/nbs/.ipynb_checkpoints/01_levelSetKDEx_univariate-checkpoint.ipynb +++ b/nbs/.ipynb_checkpoints/01_levelSetKDEx_univariate-checkpoint.ipynb @@ -646,7 +646,7 @@ " for i in range(len(yPred)):\n", " \n", " if i == 0:\n", - " lowerBoundPerBin[binIndex] = np.NINF\n", + " lowerBoundPerBin[binIndex] = -np.inf\n", " \n", " currentBinSize += 1\n", " trainIndicesLeft -= 1\n", @@ -735,128 +735,128 @@ "metadata": {}, "outputs": [], "source": [ - "#| export\n", - "\n", - "from drf import drf \n", - "\n", - "class LevelSetKDEx_DRF(BaseWeightsBasedEstimator, BaseLSx):\n", - " \"\"\"\n", - " `LevelSetKDEx` turns any point forecasting model into an estimator of the underlying conditional density.\n", - " The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts\n", - " as a similarity measure between samples. The point forecasts of the training samples are sorted and \n", - " recursively assigned to a bin until the size of the current bin reaches `binSize` many samples. Then\n", - " a new bin is created and so on. For a new test sample we check into which bin its point prediction\n", - " would have fallen and interpret the training samples of that bin as the empirical distribution function\n", - " of this test sample. \n", - " \"\"\"\n", - " \n", - " def __init__(self, \n", - " estimator, # Model with a .fit and .predict-method (implementing the scikit-learn estimator interface).\n", - " binSize: int=100, # Size of the bins created while running fit.\n", - " ):\n", - " \n", - " super(BaseEstimator, self).__init__(estimator = estimator)\n", - "\n", - " # Check if binSize is integer\n", - " if not isinstance(binSize, (int, np.int32, np.int64)):\n", - " raise ValueError(\"'binSize' must be an integer!\")\n", - "\n", - " self.binSize = binSize\n", - " \n", - " self.yTrain = None\n", - " self.yPredTrain = None\n", - " self.drf = None\n", - " self.fitted = False\n", - " \n", - " #---\n", - " \n", - " def fit(self: LevelSetKDEx_DRF, \n", - " X: np.ndarray, # Feature matrix used by `estimator` to predict `y`.\n", - " y: np.ndarray, # 1-dimensional target variable corresponding to the feature matrix `X`.\n", - " ):\n", - " \"\"\"\n", - " Fit `LevelSetKDEx` model by grouping the point predictions of the samples specified via `X`\n", - " according to their value. Samples are recursively sorted into bins until each bin contains\n", - " `binSize` many samples. For details, checkout the function `generateBins` which does the\n", - " heavy lifting.\n", - " \"\"\"\n", - " \n", - " # Checks\n", - " if not isinstance(self.binSize, (int, np.int32, np.int64)):\n", - " raise ValueError(\"'binSize' must be an integer!\")\n", + "# #| export\n", + "\n", + "# from drf import drf \n", + "\n", + "# class LevelSetKDEx_DRF(BaseWeightsBasedEstimator, BaseLSx):\n", + "# \"\"\"\n", + "# `LevelSetKDEx` turns any point forecasting model into an estimator of the underlying conditional density.\n", + "# The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts\n", + "# as a similarity measure between samples. The point forecasts of the training samples are sorted and \n", + "# recursively assigned to a bin until the size of the current bin reaches `binSize` many samples. Then\n", + "# a new bin is created and so on. For a new test sample we check into which bin its point prediction\n", + "# would have fallen and interpret the training samples of that bin as the empirical distribution function\n", + "# of this test sample. \n", + "# \"\"\"\n", + " \n", + "# def __init__(self, \n", + "# estimator, # Model with a .fit and .predict-method (implementing the scikit-learn estimator interface).\n", + "# binSize: int=100, # Size of the bins created while running fit.\n", + "# ):\n", + " \n", + "# super(BaseEstimator, self).__init__(estimator = estimator)\n", + "\n", + "# # Check if binSize is integer\n", + "# if not isinstance(binSize, (int, np.int32, np.int64)):\n", + "# raise ValueError(\"'binSize' must be an integer!\")\n", + "\n", + "# self.binSize = binSize\n", + " \n", + "# self.yTrain = None\n", + "# self.yPredTrain = None\n", + "# self.drf = None\n", + "# self.fitted = False\n", + " \n", + "# #---\n", + " \n", + "# def fit(self: LevelSetKDEx_DRF, \n", + "# X: np.ndarray, # Feature matrix used by `estimator` to predict `y`.\n", + "# y: np.ndarray, # 1-dimensional target variable corresponding to the feature matrix `X`.\n", + "# ):\n", + "# \"\"\"\n", + "# Fit `LevelSetKDEx` model by grouping the point predictions of the samples specified via `X`\n", + "# according to their value. Samples are recursively sorted into bins until each bin contains\n", + "# `binSize` many samples. For details, checkout the function `generateBins` which does the\n", + "# heavy lifting.\n", + "# \"\"\"\n", + " \n", + "# # Checks\n", + "# if not isinstance(self.binSize, (int, np.int32, np.int64)):\n", + "# raise ValueError(\"'binSize' must be an integer!\")\n", " \n", - " if self.binSize > y.shape[0]:\n", - " raise ValueError(\"'binSize' mustn't be bigger than the size of 'y'!\")\n", + "# if self.binSize > y.shape[0]:\n", + "# raise ValueError(\"'binSize' mustn't be bigger than the size of 'y'!\")\n", " \n", - " if X.shape[0] != y.shape[0]:\n", - " raise ValueError(\"'X' and 'y' must contain the same number of samples!\")\n", + "# if X.shape[0] != y.shape[0]:\n", + "# raise ValueError(\"'X' and 'y' must contain the same number of samples!\")\n", " \n", - " #---\n", + "# #---\n", " \n", - " try:\n", - " yPred = self.estimator.predict(X)\n", + "# try:\n", + "# yPred = self.estimator.predict(X)\n", " \n", - " except NotFittedError:\n", - " try:\n", - " self.estimator.fit(X = X, y = y) \n", - " except:\n", - " raise ValueError(\"Couldn't fit 'estimator' with user specified 'X' and 'y'!\")\n", - " else:\n", - " yPred = self.estimator.predict(X)\n", + "# except NotFittedError:\n", + "# try:\n", + "# self.estimator.fit(X = X, y = y) \n", + "# except:\n", + "# raise ValueError(\"Couldn't fit 'estimator' with user specified 'X' and 'y'!\")\n", + "# else:\n", + "# yPred = self.estimator.predict(X)\n", " \n", - " #---\n", + "# #---\n", " \n", - " yPred = pd.DataFrame(yPred)\n", - " y = pd.Series(y)\n", + "# yPred = pd.DataFrame(yPred)\n", + "# y = pd.Series(y)\n", "\n", - " DRF = drf(min_node_size = self.binSize, num_trees = 100, num_features = 1, honesty = False, sample_fraction = 0.5, response_scaling = False, mtry = 1, num_threads = 16)\n", - " DRF.fit(X = yPred, Y = y)\n", + "# DRF = drf(min_node_size = self.binSize, num_trees = 100, num_features = 1, honesty = False, sample_fraction = 0.5, response_scaling = False, mtry = 1, num_threads = 16)\n", + "# DRF.fit(X = yPred, Y = y)\n", " \n", - " #---\n", + "# #---\n", " \n", - " # IMPORTANT: In case 'y' is given as a pandas.Series, we can potentially run into indexing \n", - " # problems later on.\n", - " self.yTrain = y.ravel()\n", + "# # IMPORTANT: In case 'y' is given as a pandas.Series, we can potentially run into indexing \n", + "# # problems later on.\n", + "# self.yTrain = y.ravel()\n", " \n", - " self.yPredTrain = yPred\n", - " self.drf = DRF\n", - " self.fitted = True\n", + "# self.yPredTrain = yPred\n", + "# self.drf = DRF\n", + "# self.fitted = True\n", " \n", - " #---\n", + "# #---\n", " \n", - " def getWeights(self: LevelSetKDEx_DRF, \n", - " X: np.ndarray, # Feature matrix for which conditional density estimates are computed.\n", - " # Specifies structure of the returned density estimates. One of: \n", - " # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'\n", - " outputType: str='onlyPositiveWeights', \n", - " # Optional. List with length X.shape[0]. Values are multiplied to the estimated \n", - " # density of each sample for scaling purposes.\n", - " scalingList: list=None, \n", - " ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.\n", + "# def getWeights(self: LevelSetKDEx_DRF, \n", + "# X: np.ndarray, # Feature matrix for which conditional density estimates are computed.\n", + "# # Specifies structure of the returned density estimates. One of: \n", + "# # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'\n", + "# outputType: str='onlyPositiveWeights', \n", + "# # Optional. List with length X.shape[0]. Values are multiplied to the estimated \n", + "# # density of each sample for scaling purposes.\n", + "# scalingList: list=None, \n", + "# ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.\n", " \n", - " # __annotations__ = BaseWeightsBasedEstimator.getWeights.__annotations__\n", - " __doc__ = BaseWeightsBasedEstimator.getWeights.__doc__\n", + "# # __annotations__ = BaseWeightsBasedEstimator.getWeights.__annotations__\n", + "# __doc__ = BaseWeightsBasedEstimator.getWeights.__doc__\n", " \n", - " if not self.fitted:\n", - " raise NotFittedError(\"This LevelSetKDEx instance is not fitted yet. Call 'fit' with \"\n", - " \"appropriate arguments before trying to compute weights.\")\n", + "# if not self.fitted:\n", + "# raise NotFittedError(\"This LevelSetKDEx instance is not fitted yet. Call 'fit' with \"\n", + "# \"appropriate arguments before trying to compute weights.\")\n", " \n", - " #---\n", + "# #---\n", " \n", - " yPred = self.estimator.predict(X)\n", - " yPred = pd.DataFrame(yPred)\n", + "# yPred = self.estimator.predict(X)\n", + "# yPred = pd.DataFrame(yPred)\n", " \n", - " weightsArray = self.drf.predict(yPred).weights\n", - " weightsList = list(weightsArray)\n", - " weightsDataList = [(weights[weights > 0], np.where(weights > 0)[0]) for weights in weightsList]\n", + "# weightsArray = self.drf.predict(yPred).weights\n", + "# weightsList = list(weightsArray)\n", + "# weightsDataList = [(weights[weights > 0], np.where(weights > 0)[0]) for weights in weightsList]\n", "\n", - " weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, \n", - " outputType = outputType, \n", - " y = self.yTrain,\n", - " scalingList = scalingList,\n", - " equalWeights = True)\n", + "# weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, \n", + "# outputType = outputType, \n", + "# y = self.yTrain,\n", + "# scalingList = scalingList,\n", + "# equalWeights = True)\n", " \n", - " return weightsDataList\n", + "# return weightsDataList\n", " \n", " " ] @@ -2842,9 +2842,9 @@ ], "metadata": { "kernelspec": { - "display_name": "dddex39", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "dddex39" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -2856,7 +2856,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/nbs/.ipynb_checkpoints/03_wSAA-checkpoint.ipynb b/nbs/.ipynb_checkpoints/03_wSAA-checkpoint.ipynb index aa777aa..7337bd8 100644 --- a/nbs/.ipynb_checkpoints/03_wSAA-checkpoint.ipynb +++ b/nbs/.ipynb_checkpoints/03_wSAA-checkpoint.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "6a9df2e7-fdc8-44e7-b16c-d8b15ed01db9", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "8c687b2c-941e-4e2a-9855-aa227ccb8490", "metadata": {}, "outputs": [], @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "6a1faac0-5c0d-4c70-80e9-7d959811b1b7", "metadata": {}, "outputs": [], @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "b86f92e6-67ad-492b-9779-7b9acfc3d3df", "metadata": {}, "outputs": [], @@ -68,13 +68,14 @@ "import pandas as pd\n", "import numpy as np\n", "import copy\n", + "from collections import defaultdict\n", "\n", "from sklearn.ensemble import RandomForestRegressor\n", "from lightgbm import LGBMRegressor\n", "from sklearn.base import MetaEstimatorMixin\n", "from lightgbm.sklearn import LGBMModel\n", "from dddex.baseClasses import BaseWeightsBasedEstimator\n", - "from dddex.utils import restructureWeightsDataList" + "from dddex.utils import restructureWeightsDataList, restructureWeightsDataList_multivariate" ] }, { @@ -87,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "36b775c4-ca57-4539-a0b8-e282bdf963cf", "metadata": {}, "outputs": [], @@ -147,11 +148,28 @@ "\n", " #---\n", "\n", - " weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, \n", - " outputType = outputType, \n", - " y = self.yTrain, \n", - " scalingList = scalingList,\n", - " equalWeights = False)\n", + " # Check if self.yTrain is a 2D array with more than one column.\n", + " if len(self.yTrain.shape) > 1:\n", + " if self.yTrain.shape[1] > 1:\n", + "\n", + " if not outputType in ['all', 'onlyPositiveWeights', 'summarized']:\n", + " raise ValueError(\"outputType must be one of 'all', 'onlyPositiveWeights', 'summarized' for multivariate y.\")\n", + " \n", + " weightsDataList = restructureWeightsDataList_multivariate(weightsDataList = weightsDataList, \n", + " outputType = outputType, \n", + " y = self.yTrain, \n", + " scalingList = scalingList,\n", + " equalWeights = False) \n", + " \n", + " else:\n", + " weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, \n", + " outputType = outputType, \n", + " y = self.yTrain, \n", + " scalingList = scalingList,\n", + " equalWeights = False)\n", + " \n", + " \n", + " \n", "\n", " return weightsDataList\n", " \n", @@ -184,33 +202,152 @@ ] }, { - "cell_type": "code", - "execution_count": 6, - "id": "7db18d0f-bdb7-4f08-a8cf-ac07a49e7d2c", - "metadata": {}, - "outputs": [], - "source": [ - "# show_doc(RandomForestWSAA)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "2a240e3b-5acc-4f82-8f4d-29391f5692b7", + "cell_type": "markdown", + "id": "8cfff03a", "metadata": {}, - "outputs": [], "source": [ - "# show_doc(RandomForestWSAA.fit)" + "## wSAA - Random Forest2" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "84ebe067-f51e-4038-ae00-06bafa7ca014", + "execution_count": null, + "id": "cfcc0602", "metadata": {}, "outputs": [], "source": [ - "# show_doc(RandomForestWSAA.getWeights)" + "#| export \n", + "\n", + "# We attempt here to speed up the computation of the weights by interpreting every single\n", + "# tree as a lookup table. This way we don't have to compare the leaf-Indices arrays of each\n", + "# training sample and each test sample.\n", + "# Unfortunately, despite the fact that this strategy works very well for a single tree,\n", + "# it doesn't work for the whole forest because the structure of the output of the lookup \n", + "# tables per tree makes it difficult to aggregate the received weights per tree \n", + "# over all trees.\n", + "\n", + "class RandomForestWSAA2(RandomForestRegressor, BaseWeightsBasedEstimator):\n", + " \n", + " def fit(self, \n", + " X: np.ndarray, # Feature matrix\n", + " y: np.ndarray, # Target values\n", + " **kwargs):\n", + "\n", + " super().fit(X = X, \n", + " y = y, \n", + " **kwargs)\n", + " \n", + " self.yTrain = y\n", + " \n", + " leafIndices = self.apply(X)\n", + "\n", + " indicesPerBinPerTree = list()\n", + "\n", + " for indexTree in range(self.n_estimators):\n", + " leafIndicesPerTree = leafIndices[:, indexTree]\n", + "\n", + " indicesPerBin = defaultdict(list)\n", + "\n", + " for index, leafIndex in enumerate(leafIndicesPerTree):\n", + " indicesPerBin[leafIndex].append(index)\n", + "\n", + " indicesPerBinPerTree.append(indicesPerBin)\n", + " \n", + " self.indicesPerBinPerTree = indicesPerBinPerTree\n", + "\n", + " \n", + " \n", + " #---\n", + " \n", + " def getWeights(self, \n", + " X: np.ndarray, # Feature matrix for which conditional density estimates are computed.\n", + " # Specifies structure of the returned density estimates. One of: \n", + " # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'\n", + " outputType: str='onlyPositiveWeights', \n", + " # Optional. List with length X.shape[0]. Values are multiplied to the estimated \n", + " # density of each sample for scaling purposes.\n", + " scalingList: list=None, \n", + " ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.\n", + " \n", + " __doc__ = BaseWeightsBasedEstimator.getWeights.__doc__\n", + " \n", + " #---\n", + " \n", + " leafIndicesPerTree = self.apply(X)\n", + " \n", + " weightsDataList = list()\n", + "\n", + " for leafIndices in leafIndicesPerTree:\n", + " \n", + " weights = np.zeros(self.yTrain.shape[0])\n", + "\n", + " for indexTree in range(len(leafIndices)):\n", + " indicesPosWeight = self.indicesPerBinPerTree[indexTree][leafIndices[indexTree]]\n", + "\n", + " weightsNew = np.zeros(self.yTrain.shape[0])\n", + " np.put(weightsNew, indicesPosWeight, 1 / len(indicesPosWeight))\n", + " \n", + " weights = weights + weightsNew\n", + "\n", + " weights = weights / len(leafIndices)\n", + "\n", + " weightsPosIndex = np.where(weights > 0)[0]\n", + "\n", + " weightsDataList.append((weights[weightsPosIndex], weightsPosIndex))\n", + "\n", + " #---\n", + "\n", + " # Check if self.yTrain is a 2D array with more than one column.\n", + " if len(self.yTrain.shape) > 1:\n", + " if self.yTrain.shape[1] > 1:\n", + "\n", + " if not outputType in ['all', 'onlyPositiveWeights', 'summarized']:\n", + " raise ValueError(\"outputType must be one of 'all', 'onlyPositiveWeights', 'summarized' for multivariate y.\")\n", + " \n", + " weightsDataList = restructureWeightsDataList_multivariate(weightsDataList = weightsDataList, \n", + " outputType = outputType, \n", + " y = self.yTrain, \n", + " scalingList = scalingList,\n", + " equalWeights = False) \n", + " \n", + " else:\n", + " weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, \n", + " outputType = outputType, \n", + " y = self.yTrain, \n", + " scalingList = scalingList,\n", + " equalWeights = False)\n", + " \n", + " \n", + " \n", + "\n", + " return weightsDataList\n", + " \n", + " #---\n", + " \n", + " def predict(self : BaseWeightsBasedEstimator, \n", + " X: np.ndarray, # Feature matrix for which conditional quantiles are computed.\n", + " probs: list, # Probabilities for which quantiles are computed.\n", + " outputAsDf: bool=True, # Determines output. Either a dataframe with probs as columns or a dict with probs as keys.\n", + " # Optional. List with length X.shape[0]. Values are multiplied to the predictions\n", + " # of each sample to rescale values.\n", + " scalingList: list=None, \n", + " ): \n", + " \n", + " __doc__ = BaseWeightsBasedEstimator.predict.__doc__\n", + " \n", + " return super(MetaEstimatorMixin, self).predict(X = X,\n", + " probs = probs, \n", + " scalingList = scalingList)\n", + " \n", + " #---\n", + " \n", + " def pointPredict(self,\n", + " X: np.ndarray, # Feature Matrix\n", + " **kwargs):\n", + " \"\"\"Original `predict` method to generate point forecasts\"\"\"\n", + " \n", + " return super().predict(X = X,\n", + " **kwargs)\n" ] }, { @@ -223,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "da815ec8", "metadata": {}, "outputs": [], @@ -329,7 +466,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "952b21e7-4c15-4c6d-8ebd-236740ec71e5", "metadata": {}, "outputs": [], @@ -458,90 +595,203 @@ { "cell_type": "code", "execution_count": null, - "id": "e6774387-0636-452e-842e-bc998a834395", + "id": "9e79762a-1427-49d7-b8b2-42ecde48e16c", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#| hide\n", + "import nbdev; nbdev.nbdev_export()" + ] + }, + { + "cell_type": "markdown", + "id": "01f57a0f-e4d4-4655-858c-d6319f087936", + "metadata": {}, + "source": [ + "# Test Code" + ] }, { "cell_type": "code", - "execution_count": 11, - "id": "6dc67de5-0fcb-47b3-82dd-908cbedaae31", + "execution_count": null, + "id": "c5f055de-89ce-4943-8242-8f434ee8a3f1", "metadata": {}, "outputs": [], "source": [ - "# show_doc(SampleAverageApproximation)" + "# #| hide\n", + "\n", + "# from lightgbm import LGBMRegressor\n", + "# import lightgbm as lgb\n", + "# from dddex.loadData import *\n", + "# from datasetsDynamic.loadDataYaz import loadDataYaz\n", + "# import ipdb\n", + "# import inspect\n", + "# from sklearn.base import RegressorMixin\n", + "# from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "\n", + "# data, XTrain, yTrain, XTest, yTest = loadDataBakery()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b50a254", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(72300, 227)\n", + "(72300, 7)\n" + ] + } + ], + "source": [ + "# #| hide\n", + "\n", + "# data, XTrain, yTrain, XTest, yTest = loadDataYaz(testDays = 14,\n", + "# daysToCut = 0,\n", + "# normalizeDemand = True,\n", + "# unstacked = True,\n", + "# returnXY = True)\n", + "\n", + "# # RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1, max_depth = 3)\n", + "# # RF.fit(X = XTrain, y = yTrain)\n", + "\n", + "# # Duplicate XTrain and yTrain m times\n", + "# m = 100\n", + "# XTrain = np.vstack([XTrain for i in range(m)])\n", + "# yTrain = np.vstack([yTrain for i in range(m)])\n", + "\n", + "# print(XTrain.shape)\n", + "# print(yTrain.shape)\n", + "\n", + "# # Add gaussian to XTrain and yTrain\n", + "# XTrain = XTrain + np.random.normal(0, 0.1, XTrain.shape)\n", + "# yTrain = yTrain + np.random.normal(0, 0.1, yTrain.shape)" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "a9520933-6480-4e96-bace-8b9f164cd922", + "execution_count": null, + "id": "a8e0a068", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 51.7 s, sys: 0 ns, total: 51.7 s\n", + "Wall time: 51.7 s\n" + ] + } + ], "source": [ - "# show_doc(SampleAverageApproximation.fit)" + "# %%time\n", + "# RFWSAA = RandomForestWSAA(n_estimators = 10, n_jobs = 1, max_depth = 4)\n", + "# RFWSAA.fit(X = XTrain, y = yTrain)" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "c5e77e0d-5cb8-4c29-b356-0f0e61bccde0", + "execution_count": null, + "id": "e7a01f60", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 51.7 s, sys: 0 ns, total: 51.7 s\n", + "Wall time: 51.7 s\n" + ] + } + ], "source": [ - "# show_doc(SampleAverageApproximation.getWeights)" + "# %%time\n", + "# RFWSAA2 = RandomForestWSAA2(n_estimators = 10, max_depth = 4, n_jobs = 1)\n", + "# RFWSAA2.fit(X = XTrain, y = yTrain)" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "9e79762a-1427-49d7-b8b2-42ecde48e16c", + "execution_count": null, + "id": "d4515e12", "metadata": {}, "outputs": [], "source": [ - "#| hide\n", - "import nbdev; nbdev.nbdev_export()" + "# n = 10000" ] }, { "cell_type": "code", "execution_count": null, - "id": "7c5ccab9-da3c-4bb1-ad35-9cfcdade92bd", + "id": "59e3c0fe", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 15.4 s, sys: 2.92 s, total: 18.3 s\n", + "Wall time: 18.3 s\n" + ] + } + ], + "source": [ + "# %%time\n", + "# weights = RFWSAA.getWeights(X = XTrain[:n])" + ] }, { - "cell_type": "markdown", - "id": "01f57a0f-e4d4-4655-858c-d6319f087936", + "cell_type": "code", + "execution_count": null, + "id": "d58be9c9", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1min 20s, sys: 1.44 s, total: 1min 21s\n", + "Wall time: 1min 21s\n" + ] + } + ], "source": [ - "# Test Code" + "# %%time\n", + "# weights2 = RFWSAA2.getWeights(X = XTrain[:n])" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "c5f055de-89ce-4943-8242-8f434ee8a3f1", + "execution_count": null, + "id": "fdbb9dfd", "metadata": {}, "outputs": [], "source": [ - "# #| hide\n", + "# RF.apply(XTrain).shape\n", "\n", - "# from lightgbm import LGBMRegressor\n", - "# import lightgbm as lgb\n", - "# from dddex.loadData import *\n", - "# import ipdb\n", - "# import inspect\n", - "# from sklearn.base import RegressorMixin\n", + "# indicesPerBinPerTree = list()\n", "\n", - "# data, XTrain, yTrain, XTest, yTest = loadDataBakery()" + "# for indexTree, tree in enumerate(RF.estimators_):\n", + "# leafIndicesTrain = tree.apply(XTrain)\n", + "\n", + "# indicesPerBin = defaultdict(list)\n", + "\n", + "# for index, leafIndex in enumerate(leafIndicesTrain):\n", + "# indicesPerBin[leafIndex].append(index)\n", + "\n", + "# indicesPerBinPerTree.append(indicesPerBin)\n", + " \n", + " \n" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "bbcd2880-1b9e-4d07-ab84-4ba8a2bef226", "metadata": {}, "outputs": [], @@ -559,9 +809,9 @@ ], "metadata": { "kernelspec": { - "display_name": "dddex", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "dddex" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -573,7 +823,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.15" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/nbs/01_levelSetKDEx_univariate.ipynb b/nbs/01_levelSetKDEx_univariate.ipynb index cfcfc64..29d264d 100644 --- a/nbs/01_levelSetKDEx_univariate.ipynb +++ b/nbs/01_levelSetKDEx_univariate.ipynb @@ -646,7 +646,7 @@ " for i in range(len(yPred)):\n", " \n", " if i == 0:\n", - " lowerBoundPerBin[binIndex] = np.NINF\n", + " lowerBoundPerBin[binIndex] = -np.inf\n", " \n", " currentBinSize += 1\n", " trainIndicesLeft -= 1\n",