Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] GLM with multiple distributions and link function support #384

Merged
merged 21 commits into from
Jun 21, 2024
Merged
Changes from 12 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
eaa3aad
[ENH] test_methods_p handling shuffle
ShreeshaM07 Jun 11, 2024
c822d3d
Merge branch 'sktime:main' into tmp
ShreeshaM07 Jun 12, 2024
2906e11
[ENH] GLM with multiple family and link support
ShreeshaM07 Jun 12, 2024
2975628
[ENH] GLMs with multiple link and distribution support
ShreeshaM07 Jun 12, 2024
c5802a3
[ENH] GLMs with multiple distributions and links
ShreeshaM07 Jun 12, 2024
38385d8
default "Normal"
ShreeshaM07 Jun 12, 2024
888d67a
modified gamma params
ShreeshaM07 Jun 13, 2024
f6ff39e
link function support
ShreeshaM07 Jun 13, 2024
40c9986
Merge branch 'sktime:main' into glm
ShreeshaM07 Jun 13, 2024
4bdd71d
offset and exposure added as bool and part of X
ShreeshaM07 Jun 16, 2024
e6b20d1
offset and exposure initialized in constructor itself with size
ShreeshaM07 Jun 16, 2024
8945ebd
Back to no `offset`/`exposure`
ShreeshaM07 Jun 16, 2024
064517c
Revert "offset and exposure initialized in constructor itself with size"
ShreeshaM07 Jun 17, 2024
e79cdd4
offset_var and exposure_var implemented
ShreeshaM07 Jun 17, 2024
53ec0d1
params order changed for deprecation handling
ShreeshaM07 Jun 17, 2024
f0e9bcc
added test_glm for offset and exposure
ShreeshaM07 Jun 18, 2024
a1523ea
deprecation for changing sequence of params
ShreeshaM07 Jun 19, 2024
a75b097
removed sktime dependency in warning
ShreeshaM07 Jun 19, 2024
9ee6e94
modified requested changes
ShreeshaM07 Jun 20, 2024
2a1044d
order unchanged parameters back to default.
ShreeshaM07 Jun 20, 2024
3a0a621
init modified
ShreeshaM07 Jun 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 123 additions & 18 deletions skpro/regression/linear/_glm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Interface adapter for the Generalized Linear Model Regressor with Gaussian Link."""
"""Interface adapter for the Generalized Linear Model Regressor."""
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)

__author__ = ["ShreeshaM07", "julian-fong"]

import pandas as pd

from skpro.regression.base import BaseProbaRegressor
Expand All @@ -18,6 +20,16 @@ class GLMRegressor(BaseProbaRegressor):

Parameters
----------
family : str
Available options are
1.Normal
2.Poisson
ShreeshaM07 marked this conversation as resolved.
Show resolved Hide resolved
3.Gamma
link : str
Available safe options are
ShreeshaM07 marked this conversation as resolved.
Show resolved Hide resolved
Normal : Log, Identity, InversePower
Poisson : Log, Identity, Sqrt
Gamma : Log, Identity, InversePower
missing : str
Available options are 'none', 'drop' and 'raise'. If 'none', no nan
checking is done. If 'drop', any observations with nans are dropped.
Expand Down Expand Up @@ -157,8 +169,8 @@ class GLMRegressor(BaseProbaRegressor):
"""

_tags = {
"authors": ["julian-fong"],
"maintainers": ["julian-fong"],
"authors": ["ShreeshaM07", "julian-fong"],
"maintainers": ["ShreeshaM07", "julian-fong"],
"python_version": None,
"python_dependencies": "statsmodels",
"capability:multioutput": False,
Expand All @@ -167,8 +179,44 @@ class GLMRegressor(BaseProbaRegressor):
"y_inner_mtype": "pd_DataFrame_Table",
}

def _str_to_sm_family(self, family, link):
    """Convert family and link names to a statsmodels family object.

    If a link function is explicitly given, it is instantiated and passed
    to the family/distribution object; otherwise the family is created
    with its default link.

    Parameters
    ----------
    family : str
        Name of the distribution family, one of
        "Normal", "Poisson", "Gamma".
    link : str or None
        Name of the link function, one of
        "Log", "Identity", "InversePower", "Sqrt".
        If None, the default link of the family is used.

    Returns
    -------
    statsmodels family instance
        The family built with the requested link if statsmodels accepts
        the combination, otherwise the family with its default link.

    Raises
    ------
    KeyError
        If ``family`` is not one of the supported family names.
    """
    from warnings import warn

    from statsmodels.genmod.families.family import Gamma, Gaussian, Poisson
    from statsmodels.genmod.families.links import Identity, InversePower, Log, Sqrt

    sm_fmly = {
        "Normal": Gaussian,
        "Poisson": Poisson,
        "Gamma": Gamma,
    }

    links = {
        "Log": Log,
        "Identity": Identity,
        "InversePower": InversePower,
        "Sqrt": Sqrt,
    }

    # Resolve the family before looking at the link, so that an unknown
    # family is reported as such instead of first triggering a misleading
    # "invalid link" warning.
    if family not in sm_fmly:
        raise KeyError(
            f"Unknown family {family!r}, expected one of {sorted(sm_fmly)}"
        )
    family_cls = sm_fmly[family]

    if link is None:
        return family_cls()

    if link not in links:
        # Previously an unrecognised link name was dropped without notice.
        warn(
            f"Unknown link {link!r}, expected one of {sorted(links)}; "
            "default link will be used",
            stacklevel=2,
        )
        return family_cls()

    link_function = links[link]()
    try:
        return family_cls(link_function)
    except ValueError:
        # statsmodels rejects link/family combinations it considers invalid
        # with a ValueError; fall back to the family's default link.
        msg = "Invalid link for family, default link will be used"
        warn(msg, stacklevel=2)
        return family_cls()

def __init__(
self,
family="Normal",
link=None,
Copy link
Collaborator

@fkiraly fkiraly Jun 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would have been great to have these as the first params from the get-go, but right now we cannot add these at the start of the parameter list due to the deprecation policy - it would make user code break.

We have to add the new params at the end, and I would suggest following deprecation policy to move them to the start eventually, see https://www.sktime.net/en/latest/developer_guide/deprecation.html

missing="none",
start_params=None,
maxiter=100,
Expand All @@ -184,9 +232,11 @@ def __init__(
add_constant=False,
):
super().__init__()
from statsmodels.genmod.families.family import Gaussian

self._family = Gaussian()
if family is None:
family = "Normal"
self.family = family
self.link = link
self.missing = missing
self.start_params = start_params
self.maxiter = maxiter
Expand Down Expand Up @@ -231,10 +281,14 @@ def _fit(self, X, y):

y_col = y.columns

family = self.family
link = self.link
sm_family = self._str_to_sm_family(family=family, link=link)

glm_estimator = GLM(
endog=y,
exog=X_,
family=self._family,
family=sm_family,
missing=self.missing,
)

Expand Down Expand Up @@ -313,6 +367,47 @@ def _predict(self, X):

return y_pred

def _params_sm_to_skpro(self, y_predictions_df, index, columns, family):
    """Build the skpro distribution matching a statsmodels prediction frame.

    Parameters
    ----------
    y_predictions_df : pd.DataFrame
        Prediction summary; the columns ``"mean"`` and, for the Normal and
        Gamma families, ``"mean_se"`` are read from it.
    index : pd.Index
        Passed to the distribution as ``index``.
    columns : pd.Index
        Passed to the distribution as ``columns``.
    family : str
        One of "Normal", "Poisson", "Gamma"; any other value is treated
        as "Normal".

    Returns
    -------
    y_pred : skpro distribution
        Normal(mu, sigma), Poisson(mu) or Gamma(alpha, beta) instance.
    """
    from skpro.distributions.gamma import Gamma
    from skpro.distributions.normal import Normal
    from skpro.distributions.poisson import Poisson

    distr_by_family = {"Normal": Normal, "Poisson": Poisson, "Gamma": Gamma}
    # families without a mapping fall back to the Normal distribution
    distr_cls = distr_by_family.get(family, Normal)

    pred_mean = y_predictions_df["mean"]

    # NOTE(review): "mean_se" looks like the standard error of the predicted
    # mean rather than the spread of the response - confirm it is the
    # intended source for sigma / the Gamma moment matching below.
    if distr_cls is Normal:
        pred_se = y_predictions_df["mean_se"]
        distr_kwargs = {
            "mu": pred_mean.rename("mu").to_frame(),
            "sigma": pred_se.rename("sigma").to_frame(),
        }
    elif distr_cls is Poisson:
        distr_kwargs = {"mu": pred_mean.rename("mu").to_frame()}
    else:
        # Gamma: shape = (mean / sd) ** 2, rate = shape / mean
        pred_se = y_predictions_df["mean_se"]
        shape = (pred_mean / pred_se) ** 2
        rate = shape / pred_mean
        distr_kwargs = {
            "alpha": shape.rename("alpha").to_frame(),
            "beta": rate.rename("beta").to_frame(),
        }

    distr_kwargs["index"] = index
    distr_kwargs["columns"] = columns

    return distr_cls(**distr_kwargs)

def _predict_proba(self, X):
"""Predict distribution over labels for data from features.

Expand All @@ -332,24 +427,20 @@ def _predict_proba(self, X):
y_pred : skpro BaseDistribution, same length as `X`
labels predicted for `X`
"""
from skpro.distributions.normal import Normal

X_ = self._prep_x(X)

# instead of using the conventional predict() method, we use statsmodels
# get_prediction method, which returns a pandas df that contains
# the prediction and prediction variance i.e mu and sigma
y_column = self.y_col
y_predictions_df = self.glm_fit_.get_prediction(X_).summary_frame()
y_mu = y_predictions_df["mean"].rename("mu").to_frame()
y_sigma = y_predictions_df["mean_se"].rename("sigma").to_frame()
params = {
"mu": y_mu,
"sigma": y_sigma,
"index": X_.index,
"columns": y_column,
}
y_pred = Normal(**params)

# convert the returned values to skpro equivalent distribution
family = self.family
index = X_.index
columns = y_column

y_pred = self._params_sm_to_skpro(y_predictions_df, index, columns, family)
return y_pred

def _prep_x(self, X):
Expand Down Expand Up @@ -395,5 +486,19 @@ def get_test_params(cls, parameter_set="default"):
"""
params1 = {}
params2 = {"add_constant": True}
params3 = {
"family": "Poisson",
"add_constant": True,
}
params4 = {"family": "Gamma"}
params5 = {
"family": "Normal",
"link": "InversePower",
}
params6 = {
"family": "Poisson",
"link": "Log",
"add_constant": True,
}

return [params1, params2]
return [params1, params2, params3, params4, params5, params6]
Loading