Skip to content

Commit

Permalink
Input models
Browse files Browse the repository at this point in the history
  • Loading branch information
JrtPec committed Dec 8, 2023
1 parent 7bc09cf commit 65bf5f4
Show file tree
Hide file tree
Showing 8 changed files with 5,547 additions and 57 deletions.
1,869 changes: 1,869 additions & 0 deletions data/mvlr/sample_gas.json

Large diffs are not rendered by default.

2,586 changes: 2,586 additions & 0 deletions data/mvlr/sample_solar.json

Large diffs are not rendered by default.

923 changes: 923 additions & 0 deletions demo_mvlr.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion openenergyid/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Open Energy ID Python SDK."""

__version__ = "0.1.6"
__version__ = "0.1.7"

from .enums import Granularity
from .models import TimeSeries
Expand Down
17 changes: 12 additions & 5 deletions openenergyid/mvlr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
"""Multi-variable linear regression (MVLR) module."""

from .mvlr import MultiVariableLinearRegression, find_best_mvlr, ValidationParameters
from .models import IndependentVariable, MultiVariableRegressionResult
from .main import find_best_mvlr
from .models import (
IndependentVariableInput,
MultiVariableRegressionInput,
MultiVariableRegressionResult,
ValidationParameters,
IndependentVariableResult,
)

__all__ = [
"MultiVariableLinearRegression",
"MultiVariableRegressionResult",
"IndependentVariable",
"find_best_mvlr",
"IndependentVariableInput",
"MultiVariableRegressionInput",
"MultiVariableRegressionResult",
"ValidationParameters",
"IndependentVariableResult",
]
28 changes: 28 additions & 0 deletions openenergyid/mvlr/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Main module for the MultiVariableLinearRegression class."""

from .models import MultiVariableRegressionInput, MultiVariableRegressionResult
from .helpers import resample_input_data
from .mvlr import MultiVariableLinearRegression


def find_best_mvlr(
    data: MultiVariableRegressionInput,
) -> MultiVariableRegressionResult:
    """Return the result for the first granularity that yields a valid model.

    The granularities are tried in the order in which they appear in
    ``data.granularities``. For each one, the analysis frame is built from the
    input, resampled, fitted and validated against ``data.validation_parameters``.

    Raises
    ------
    ValueError
        If no granularity produces a model that passes validation.
    """
    for granularity in data.granularities:
        # Build the frame anew on every pass, so each granularity is resampled
        # from the input data and not from a previously resampled frame.
        resampled = resample_input_data(data=data.data_frame(), granularity=granularity)
        candidate = MultiVariableLinearRegression(
            data=resampled,
            y=data.dependent_variable,
            granularity=granularity,
            allow_negative_predictions=data.allow_negative_predictions,
        )
        candidate.do_analysis()

        thresholds = data.validation_parameters
        accepted = candidate.validate(
            min_rsquared=thresholds.rsquared,
            max_f_pvalue=thresholds.f_pvalue,
            max_pvalues=thresholds.pvalues,
        )
        if not accepted:
            continue
        return MultiVariableRegressionResult.from_mvlr(candidate)

    raise ValueError("No valid model found.")
127 changes: 120 additions & 7 deletions openenergyid/mvlr/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Models for multivariable linear regression."""
from typing import Optional
from typing import Any, List, Optional
import pandas as pd

from pydantic import BaseModel, Field, ConfigDict
import statsmodels.formula.api as fm
Expand All @@ -10,6 +11,118 @@
from .mvlr import MultiVariableLinearRegression


COLUMN_TEMPERATUREEQUIVALENT = "temperatureEquivalent"


######################
# MVLR Input Models #
######################


class ValidationParameters(BaseModel):
    """Parameters for validation of a multivariable linear regression model."""

    # Lower bound for the adjusted R-squared of the fit.
    rsquared: float = Field(
        0.75, ge=0, le=1, description="Minimum acceptable value for the adjusted R-squared"
    )
    # Upper bound for the p-value of the F-statistic (not the F-statistic itself).
    f_pvalue: float = Field(
        0.05,
        ge=0,
        le=1,
        description="Maximum acceptable value for the p-value of the F-statistic",
    )
    # Upper bound for the p-values of the individual coefficients.
    pvalues: float = Field(
        0.05, ge=0, le=1, description="Maximum acceptable value for the p-values of the t-statistic"
    )


class IndependentVariableInput(BaseModel):
    """
    Independent variable.
    Has to correspond to a column in the data frame.
    """

    # Column name of the variable in the input frame.
    name: str = Field(
        description="Name of the independent variable. "
        "If the name is `temperatureEquivalent`, "
        "it will be unpacked into columns according to the variants."
    )
    # Optional degree-day variants, each written as `<prefix>_<base temperature>`.
    variants: Optional[list[str]] = Field(
        default=None,
        description="Variants of the `temperatureEquivalent` independent variable. "
        "Eg. `HDD_16.5` will be Heating Degree Days with a base temperature of 16.5°C, "
        "`CDD_0` will be Cooling Degree Days with a base temperature of 0°C.",
    )


class MultiVariableRegressionInput(BaseModel):
    """Multi-variable regression input."""

    # Timezone handed to `TimeSeries.to_pandas` when the data frame is built.
    timezone: str = Field(alias="timeZone")
    # At least one independent variable is required.
    independent_variables: List[IndependentVariableInput] = Field(
        alias="independentVariables", min_length=1
    )
    # Name of the column that the regression has to explain.
    dependent_variable: str = Field(alias="dependentVariable")
    frame: TimeSeries
    granularities: list[Granularity]
    allow_negative_predictions: bool = Field(alias="allowNegativePredictions", default=False)
    validation_parameters: ValidationParameters = Field(
        alias="validationParameters", default=ValidationParameters()
    )

    def model_post_init(self, __context: Any) -> None:
        """Post init hook: check that all referenced columns exist in the frame.

        Raises
        ------
        ValueError
            If an independent variable or the dependent variable is not a
            column of ``frame``.
        """
        available_columns = self.frame.columns

        # Check if all independent variables are present in the data frame
        for iv in self.independent_variables:  # pylint: disable=not-an-iterable
            if iv.name not in available_columns:
                raise ValueError(f"Independent variable {iv.name} not found in the data frame.")

        # Fail early here rather than with a KeyError when the frame is assembled.
        if self.dependent_variable not in available_columns:
            raise ValueError(
                f"Dependent variable {self.dependent_variable} not found in the data frame."
            )

        return super().model_post_init(__context)

    def _data_frame(self) -> pd.DataFrame:
        """Convert the data to a pandas DataFrame."""
        return self.frame.to_pandas(timezone=self.timezone)

    def data_frame(self) -> pd.DataFrame:
        """
        Return the data frame ready for analysis.

        Unpacks degree days and removes unnecessary columns.
        If an independent variable named `temperatureEquivalent` is present
        and has variants, it is replaced by one column per variant.
        Eg. Variant "HDD_16.5" will be Heating Degree Days
        with a base temperature of 16.5°C,
        "CDD_0" will be Cooling Degree Days with a base temperature of 0°C.

        Raises
        ------
        ValueError
            If a variant is not of the form `<prefix>_<base temperature>`
            or its base temperature is not a number.
        """
        frame = self._data_frame()
        columns_to_retain = [self.dependent_variable]
        for iv in self.independent_variables:  # pylint: disable=not-an-iterable
            if iv.name == COLUMN_TEMPERATUREEQUIVALENT and iv.variants is not None:
                temperature = frame[COLUMN_TEMPERATUREEQUIVALENT]
                for variant in iv.variants:
                    parts = variant.split("_")
                    if len(parts) != 2:
                        raise ValueError(
                            f"Invalid variant {variant!r}: "
                            "expected `<prefix>_<base temperature>`, eg. `HDD_16.5` or `CDD_0`."
                        )
                    prefix = parts[0]
                    base_temperature = float(parts[1])
                    if prefix == "CDD":
                        degree_days = temperature - base_temperature
                    else:
                        # Every prefix other than "CDD" is handled as heating degree days.
                        degree_days = base_temperature - temperature
                    # Degree days are never negative.
                    frame[variant] = degree_days.clip(lower=0)
                    columns_to_retain.append(variant)
                # The raw temperature column needs no explicit drop: the selection
                # below keeps only the columns listed in `columns_to_retain`.
            else:
                columns_to_retain.append(iv.name)

        return frame[columns_to_retain].copy()


######################
# MVLR Result Models #
######################


class ConfidenceInterval(BaseModel):
"""Confidence interval for a coefficient."""

Expand All @@ -18,7 +131,7 @@ class ConfidenceInterval(BaseModel):
upper: float


class IndependentVariable(BaseModel):
class IndependentVariableResult(BaseModel):
"""Independent variable for a multivariable linear regression model."""

name: str
Expand All @@ -33,7 +146,7 @@ class IndependentVariable(BaseModel):
model_config = ConfigDict(populate_by_name=True)

@classmethod
def from_fit(cls, fit: fm.ols, name: str) -> "IndependentVariable":
def from_fit(cls, fit: fm.ols, name: str) -> "IndependentVariableResult":
"""Create an IndependentVariable from a fit."""
return cls(
name=name,
Expand All @@ -53,12 +166,12 @@ class MultiVariableRegressionResult(BaseModel):
"""Result of a multivariable regression model."""

dependent_variable: str = Field(alias="dependentVariable")
independent_variables: list[IndependentVariable] = Field(alias="independentVariables")
independent_variables: list[IndependentVariableResult] = Field(alias="independentVariables")
r2: float = Field(ge=0, le=1, alias="rSquared")
r2_adj: float = Field(ge=0, le=1, alias="rSquaredAdjusted")
f_stat: float = Field(ge=0, alias="fStat")
prob_f_stat: float = Field(ge=0, le=1, alias="probFStat")
intercept: IndependentVariable
intercept: IndependentVariableResult
granularity: Granularity
frame: TimeSeries

Expand All @@ -73,7 +186,7 @@ def from_mvlr(cls, mvlr: MultiVariableLinearRegression) -> "MultiVariableRegress
param_keys.remove("Intercept")
independent_variables = []
for k in param_keys:
independent_variables.append(IndependentVariable.from_fit(mvlr.fit, k))
independent_variables.append(IndependentVariableResult.from_fit(mvlr.fit, k))

# Create resulting TimeSeries
cols_to_keep = list(param_keys)
Expand All @@ -88,7 +201,7 @@ def from_mvlr(cls, mvlr: MultiVariableLinearRegression) -> "MultiVariableRegress
r2_adj=mvlr.fit.rsquared_adj,
f_stat=mvlr.fit.fvalue,
prob_f_stat=mvlr.fit.f_pvalue,
intercept=IndependentVariable.from_fit(mvlr.fit, "Intercept"),
intercept=IndependentVariableResult.from_fit(mvlr.fit, "Intercept"),
granularity=mvlr.granularity,
frame=TimeSeries.from_pandas(frame),
)
52 changes: 8 additions & 44 deletions openenergyid/mvlr/mvlr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,12 @@

import numpy as np
import pandas as pd
from pydantic import BaseModel, Field
import statsmodels.formula.api as fm
from patsy import LookupFactor, ModelDesc, Term # pylint: disable=no-name-in-module
from statsmodels.sandbox.regression.predstd import wls_prediction_std

from openenergyid.enums import Granularity

from .helpers import resample_input_data


class ValidationParameters(BaseModel):
    """Parameters for validation of a multivariable linear regression model."""

    # Lower bound for the adjusted R-squared of the fit.
    rsquared: float = Field(
        0.75, ge=0, le=1, description="Minimum acceptable value for the adjusted R-squared"
    )
    # Upper bound for the p-value of the F-statistic (not the F-statistic itself).
    f_pvalue: float = Field(
        0.05,
        ge=0,
        le=1,
        description="Maximum acceptable value for the p-value of the F-statistic",
    )
    # Upper bound for the p-values of the individual coefficients.
    pvalues: float = Field(
        0.05, ge=0, le=1, description="Maximum acceptable value for the p-values of the t-statistic"
    )


class MultiVariableLinearRegression:
"""Multi-variable linear regression.
Expand Down Expand Up @@ -56,7 +39,6 @@ def __init__(
confint: float = 0.95,
cross_validation: bool = False,
allow_negative_predictions: bool = False,
validation_params: ValidationParameters = None,
granularity: Granularity = None,
):
"""Parameters
Expand All @@ -80,8 +62,6 @@ def __init__(
If True, allow predictions to be negative.
For gas consumption or PV production, this is not physical
so allow_negative_predictions should be False
validation_params : ValidationParameters, default=None
Parameters to validate the model.
"""
self.data = data.copy()
if y not in self.data.columns:
Expand All @@ -95,7 +75,6 @@ def __init__(
self.confint = confint
self.cross_validation = cross_validation
self.allow_negative_predictions = allow_negative_predictions
self.validation_params = validation_params or ValidationParameters()
self.granularity = granularity
self._fit = None
self._list_of_fits = []
Expand Down Expand Up @@ -299,7 +278,7 @@ def remove_from_model_desc(x: str, model_desc: ModelDesc) -> ModelDesc:
pars_to_prune = fit.pvalues.where(fit.pvalues > p_max).dropna().index.tolist()
try:
pars_to_prune.remove("Intercept")
except KeyError:
except ValueError:
pass
while pars_to_prune:
corrected_model_desc = remove_from_model_desc(
Expand All @@ -310,7 +289,7 @@ def remove_from_model_desc(x: str, model_desc: ModelDesc) -> ModelDesc:
pars_to_prune = fit.pvalues.where(fit.pvalues > p_max).dropna().index.tolist()
try:
pars_to_prune.remove("Intercept")
except KeyError:
except ValueError:
pass
return fit

Expand Down Expand Up @@ -400,40 +379,25 @@ def add_prediction(self):
"""
self.data = self._predict(fit=self.fit, data=self.data)

@property
def is_valid(self) -> bool:
def validate(
self, min_rsquared: float = 0.75, max_f_pvalue: float = 0.05, max_pvalues: float = 0.05
) -> bool:
"""Checks if the model is valid.
Returns
-------
bool: True if the model is valid, False otherwise.
"""
if self.fit.rsquared_adj < self.validation_params.rsquared:
if self.fit.rsquared_adj < min_rsquared:
return False

if self.fit.f_pvalue > self.validation_params.f_pvalue:
if self.fit.f_pvalue > max_f_pvalue:
return False

param_keys = self.fit.pvalues.keys().tolist()
param_keys.remove("Intercept")
for k in param_keys:
if self.fit.pvalues[k] > self.validation_params.pvalues:
if self.fit.pvalues[k] > max_pvalues:
return False

return True


def find_best_mvlr(
    data: pd.DataFrame,
    y: str,
    granularities: list[Granularity],
    **kwargs,
) -> MultiVariableLinearRegression:
    """Cycle through multiple granularities and return the best model.

    The granularities are tried in the given order; the first model that
    reports itself as valid is returned.

    Parameters
    ----------
    data : pd.DataFrame
        Input data; it is resampled separately for every granularity.
    y : str
        Name of the dependent variable column.
    granularities : list[Granularity]
        Granularities to try, in order.
    **kwargs
        Passed on to `MultiVariableLinearRegression`.

    Raises
    ------
    ValueError
        If no granularity yields a valid model.
    """
    for granularity in granularities:
        # Resample from the original input every time. Rebinding `data` here would
        # feed each later granularity the previous granularity's resampled output.
        resampled = resample_input_data(data=data, granularity=granularity)
        mvlr = MultiVariableLinearRegression(
            data=resampled, y=y, granularity=granularity, **kwargs
        )
        mvlr.do_analysis()
        if mvlr.is_valid:
            return mvlr
    raise ValueError("No valid model found.")

0 comments on commit 65bf5f4

Please sign in to comment.