diff --git a/skpro/regression/cyclic_boosting.py b/skpro/regression/cyclic_boosting.py index e536f6e47..1a7a07d08 100644 --- a/skpro/regression/cyclic_boosting.py +++ b/skpro/regression/cyclic_boosting.py @@ -83,11 +83,10 @@ class CyclicBoosting(BaseProbaRegressor): be on a bounded interval, with support between ``lower`` and ``upper``. maximal_iterations : int, default=10 maximum number of iterations for the cyclic boosting algorithm - dist_type: str, one of ``'normal'`` (default), ``'logistic'`` - inner base distribution to use for the Johnson QPD, i.e., before - arcosh and similar transformations. - Available options are ``'normal'`` (default), ``'logistic'``, - or ``'sinhlogistic'``. + dist : str, default='normal' + Inner base distribution to use for the Johnson QPD, i.e., before arcosh + and similar transformations. Available options are ``'normal'`` + (default), ``'logistic'``, or ``'sinhlogistic'``. Attributes ---------- @@ -133,6 +132,11 @@ class CyclicBoosting(BaseProbaRegressor): "tests:vm": True, # requires its own test VM to run } + # TODO (release 2.14.0) + # remove the 'dist_type' argument from '__init__' signature + # remove the following 'if' check and deprecation warning + # de-indent the following 'else' check + def __init__( self, feature_groups: Union[List[str], List[Tuple[str, ...]], None] = None, @@ -142,7 +146,8 @@ def __init__( lower: Union[float, None] = None, upper: Union[float, None] = None, maximal_iterations=10, - dist_type: Union[str, None] = "normal", + dist_type: Union[str, None] = "deprecated", + dist: Union[str, None] = "normal", dist_shape: Union[float, None] = 0.0, ): self.feature_groups = feature_groups @@ -153,10 +158,28 @@ def __init__( self.upper = upper self.maximal_iterations = maximal_iterations self.dist_type = dist_type + self.dist = dist self.dist_shape = dist_shape super().__init__() + # handle deprecation of dist_type -> dist + if dist_type != "deprecated": + from warnings import 
warn + + warn( + "in `CyclicBoosting`, parameter 'dist_type' " + "will be renamed to 'dist' in version 2.14.0. " + "To keep current behaviour and to silence this warning, " + "use 'dist' instead of 'dist_type', " + "set dist explicitly via kwarg, and do not set dist_type.", + category=DeprecationWarning, + stacklevel=2, + ) + self._dist = dist_type + else: + self._dist = dist + self.quantiles = [self.alpha, 0.5, 1 - self.alpha] self.quantile_values = list() self.quantile_est = list() @@ -315,7 +338,7 @@ def _predict_proba(self, X): "qv_high": self.quantile_values[2].reshape(-1, 1), "lower": self.lower, "upper": self.upper, - "base_dist": self.dist_type, + "base_dist": self._dist, "index": index, "columns": y_cols, } diff --git a/skpro/regression/gam/_gam.py b/skpro/regression/gam/_gam.py index 21a80003d..a4ec1a1eb 100644 --- a/skpro/regression/gam/_gam.py +++ b/skpro/regression/gam/_gam.py @@ -30,7 +30,7 @@ class GAMRegressor(BaseProbaRegressor): By default a univariate spline term will be allocated for each feature. Can be a ``pygam`` terms expression for custom model specification. - distribution : str or pygam.Distribution, optional (default='Normal') + dist : str or pygam.Distribution, optional (default='Normal') Distribution family to use in the model. Supported strings (case-insensitive): @@ -81,17 +81,17 @@ class GAMRegressor(BaseProbaRegressor): >>> y_positive = y.abs() + 1 # ensure positive targets for Poisson/Gamma >>> >>> # Normal distribution (default) - >>> gam_normal = GAMRegressor(distribution='Normal') + >>> gam_normal = GAMRegressor(dist='Normal') >>> gam_normal.fit(X, y) GAMRegressor(...) >>> >>> # Poisson distribution for count data - >>> gam_poisson = GAMRegressor(distribution='Poisson', link='log') + >>> gam_poisson = GAMRegressor(dist='Poisson', link='log') >>> gam_poisson.fit(X, y_positive) GAMRegressor(...) 
>>> >>> # Gamma distribution for positive continuous data - >>> gam_gamma = GAMRegressor(distribution='Gamma', link='log') + >>> gam_gamma = GAMRegressor(dist='Gamma', link='log') >>> gam_gamma.fit(X, y_positive) GAMRegressor(...) """ @@ -109,10 +109,16 @@ class GAMRegressor(BaseProbaRegressor): "tests:vm": True, } + # TODO (release 2.14.0) + # remove the 'distribution' argument from '__init__' signature + # remove the following 'if' check and deprecation warning + # de-indent the following 'else' check + def __init__( self, terms="auto", - distribution="normal", + distribution="deprecated", + dist="normal", link="identity", max_iter=100, tol=1e-4, @@ -122,6 +128,7 @@ def __init__( ): self.terms = terms self.distribution = distribution + self.dist = dist self.link = link self.max_iter = max_iter self.tol = tol @@ -131,6 +138,23 @@ def __init__( super().__init__() + # handle deprecation of distribution -> dist + if distribution != "deprecated": + from warnings import warn + + warn( + "in `GAMRegressor`, parameter 'distribution' " + "will be renamed to 'dist' in version 2.14.0. " + "To keep current behaviour and to silence this warning, " + "use 'dist' instead of 'distribution', " + "set dist explicitly via kwarg, and do not set distribution.", + category=DeprecationWarning, + stacklevel=2, + ) + self._dist = distribution + else: + self._dist = dist + def _fit(self, X, y): """Fit regressor to training data. 
@@ -158,7 +182,7 @@ def _fit(self, X, y): if callbacks is None: callbacks = ["deviance", "diffs"] - dist_name = self._get_distribution_name(self.distribution) + dist_name = self._get_distribution_name(self._dist) # Map common names to skpro distribution names dist_map = { diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index bb9e65f87..aeb522423 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -22,10 +22,10 @@ class GLMRegressor(BaseProbaRegressor): Parameters ---------- - family : string, default : "Normal" - The family parameter denotes the type of distribution + dist : string, default : "Normal" + The dist parameter denotes the type of distribution that will be used. - Available family/distributions are + Available distributions are 1."Normal" 2."Poisson" 3."Gamma" @@ -205,7 +205,7 @@ class GLMRegressor(BaseProbaRegressor): "y_inner_mtype": "pd_DataFrame_Table", } - def _str_to_sm_family(self, family, link): + def _str_to_sm_family(self, dist, link): """Convert the string to a statsmodel object. 
If the link function is also explicitly mentioned then include then @@ -232,16 +232,22 @@ def _str_to_sm_family(self, family, link): if link in links: link_function = links[link]() try: - return sm_fmly[family](link_function) + return sm_fmly[dist](link_function) except Exception: - msg = "Invalid link for family, default link will be used" + msg = "Invalid link for distribution, default link will be used" warn(msg) - return sm_fmly[family]() + return sm_fmly[dist]() + + # TODO (release 2.14.0) + # remove the 'family' argument from '__init__' signature + # remove the following 'if' check and deprecation warning + # de-indent the following 'else' check def __init__( self, - family="Normal", + family="deprecated", + dist="Normal", link=None, offset_var=None, exposure_var=None, @@ -262,6 +268,7 @@ def __init__( super().__init__() self.family = family + self.dist = dist self.link = link self.offset_var = offset_var self.exposure_var = exposure_var @@ -279,7 +286,23 @@ def __init__( self.max_start_irls = max_start_irls self.add_constant = add_constant - self._family = self.family + # handle deprecation of family -> dist + if family != "deprecated": + from warnings import warn + + warn( + "in `GLMRegressor`, parameter 'family' " + "will be renamed to 'dist' in version 2.14.0. 
" + "To keep current behaviour and to silence this warning, " + "use 'dist' instead of 'family', " + "set dist explicitly via kwarg, and do not set family.", + category=DeprecationWarning, + stacklevel=2, + ) + self._dist = family + else: + self._dist = dist + self._link = self.link self._offset_var = self.offset_var self._exposure_var = self.exposure_var @@ -325,12 +348,12 @@ def _fit(self, X, y): # remove the offset and exposure columns which # was inserted to maintain the shape - family = self._family + dist = self._dist link = self._link # ensure numerical stability for Gamma by injecting an intercept self._auto_added_constant = False - if family == "Gamma" and not self._add_constant: + if dist == "Gamma" and not self._add_constant: self._add_constant = True self._auto_added_constant = True @@ -341,7 +364,7 @@ def _fit(self, X, y): y_col = y.columns - sm_family = self._str_to_sm_family(family=family, link=link) + sm_family = self._str_to_sm_family(dist=dist, link=link) glm_estimator = GLM( endog=y, @@ -434,7 +457,7 @@ def _predict(self, X): return y_pred - def _params_sm_to_skpro(self, y_predictions_df, index, columns, family): + def _params_sm_to_skpro(self, y_predictions_df, index, columns, dist): """Convert the statsmodels output to equivalent skpro distribution.""" from skpro.distributions.gamma import Gamma from skpro.distributions.normal import Normal @@ -449,8 +472,8 @@ def _params_sm_to_skpro(self, y_predictions_df, index, columns, family): params = {} skp_dist = Normal - if family in skpro_distr: - skp_dist = skpro_distr[family] + if dist in skpro_distr: + skp_dist = skpro_distr[dist] if skp_dist == Normal: y_mu = y_predictions_df["mean"].rename("mu").to_frame() @@ -512,11 +535,11 @@ def _predict_proba(self, X): y_predictions_df = self.glm_fit_.get_prediction(X_).summary_frame() # convert the returned values to skpro equivalent distribution - family = self._family + dist = self._dist index = X_.index columns = y_column - y_pred = 
self._params_sm_to_skpro(y_predictions_df, index, columns, family) + y_pred = self._params_sm_to_skpro(y_predictions_df, index, columns, dist) return y_pred def _prep_x(self, X, offset_var, exposure_var, rtn_off_exp_arr): diff --git a/skpro/regression/linear/_glum.py b/skpro/regression/linear/_glum.py index 77be264aa..f1a625923 100644 --- a/skpro/regression/linear/_glum.py +++ b/skpro/regression/linear/_glum.py @@ -24,14 +24,14 @@ class GlumRegressor(BaseProbaRegressor): Parameters ---------- - family : str or ExponentialDispersionModel, default='normal' + dist : str or ExponentialDispersionModel, default='normal' The distributional assumption of the GLM. One of: 'binomial', 'gamma', 'gaussian', 'inverse.gaussian', 'normal', 'poisson', 'tweedie', 'negative.binomial'. link : str or Link, default='auto' The link function of the GLM. - If 'auto', the canonical link for the family is used. - Supported links depend on the family. Common options include: + If 'auto', the canonical link for the distribution is used. + Supported links depend on the distribution. Common options include: 'identity', 'log', 'logit', 'probit', 'cloglog', 'pow', 'nbinom'. alpha : float or array-like, default=None Constant that multiplies the penalty terms. 
@@ -131,9 +131,15 @@ class GlumRegressor(BaseProbaRegressor): "tests:vm": True, } + # TODO (release 2.14.0) + # remove the 'family' argument from '__init__' signature + # remove the following 'if' check and deprecation warning + # de-indent the following 'else' check + def __init__( self, - family="normal", + family="deprecated", + dist="normal", link="auto", alpha=None, l1_ratio=0, @@ -163,6 +169,7 @@ def __init__( expected_information=False, ): self.family = family + self.dist = dist self.link = link self.alpha = alpha self.l1_ratio = l1_ratio @@ -193,14 +200,31 @@ def __init__( super().__init__() + # handle deprecation of family -> dist + if family != "deprecated": + from warnings import warn + + warn( + "in `GlumRegressor`, parameter 'family' " + "will be renamed to 'dist' in version 2.14.0. " + "To keep current behaviour and to silence this warning, " + "use 'dist' instead of 'family', " + "set dist explicitly via kwarg, and do not set family.", + category=DeprecationWarning, + stacklevel=2, + ) + self._dist = family + else: + self._dist = dist + @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator.""" - params1 = {"family": "normal"} - params2 = {"family": "gamma", "link": "log"} - params3 = {"family": "poisson"} - params4 = {"family": "negative.binomial"} - params5 = {"family": "normal", "alpha": 0.1, "l1_ratio": 0.5} + params1 = {"dist": "normal"} + params2 = {"dist": "gamma", "link": "log"} + params3 = {"dist": "poisson"} + params4 = {"dist": "negative.binomial"} + params5 = {"dist": "normal", "alpha": 0.1, "l1_ratio": 0.5} return [params1, params2, params3, params4, params5] def _fit(self, X, y): @@ -221,7 +245,7 @@ def _fit(self, X, y): from glum import GeneralizedLinearRegressor self.estimator_ = GeneralizedLinearRegressor( - family=self.family, + family=self._dist, link=self.link, alpha=self.alpha, l1_ratio=self.l1_ratio, @@ -312,30 +336,30 @@ def _predict_proba(self, X): The predicted 
distribution. """ mu = self._predict(X) - family = self.family + dist = self._dist - if isinstance(family, str): - family_str = family.lower() + if isinstance(dist, str): + dist_str = dist.lower() else: - # If family is an object, we need to infer the type + # If dist is an object, we need to infer the type # This is tricky, but let's assume string for now as per init - family_str = str(family).lower() + dist_str = str(dist).lower() - if "normal" in family_str or "gaussian" in family_str: + if "normal" in dist_str or "gaussian" in dist_str: # Normal distribution # Variance = dispersion * v(mu) = dispersion * 1 = dispersion # So sigma = sqrt(dispersion) sigma = np.sqrt(self.dispersion_) return Normal(mu=mu, sigma=sigma, index=X.index, columns=self._y_cols) - elif "poisson" in family_str: + elif "poisson" in dist_str: # Poisson distribution # skpro Poisson takes mu. # If dispersion != 1, it's not standard Poisson. # But skpro Poisson is standard. return Poisson(mu=mu, index=X.index, columns=self._y_cols) - elif "gamma" in family_str: + elif "gamma" in dist_str: # Gamma distribution # mu = alpha / beta # var = alpha / beta^2 = dispersion * mu^2 @@ -345,20 +369,20 @@ def _predict_proba(self, X): beta = 1.0 / (self.dispersion_ * mu) return Gamma(alpha=alpha, beta=beta, index=X.index, columns=self._y_cols) - elif "negative.binomial" in family_str: + elif "negative.binomial" in dist_str: # Negative Binomial # var = mu + theta * mu^2 # skpro NB takes mu and alpha (where var = mu + mu^2/alpha) # So alpha_skpro = 1/theta_glum - # We need to extract theta from family string or object - # If family is string like 'negative.binomial(1.5)', theta is 1.5 - # If family is 'negative.binomial', theta is default 1.0? + # We need to extract theta from dist string or object + # If dist is string like 'negative.binomial(1.5)', theta is 1.5 + # If dist is 'negative.binomial', theta is default 1.0? 
theta = 1.0 - if "(" in family_str: + if "(" in dist_str: try: - theta = float(family_str.split("(")[1].split(")")[0]) + theta = float(dist_str.split("(")[1].split(")")[0]) except ValueError: pass @@ -376,6 +400,6 @@ def _predict_proba(self, X): else: raise NotImplementedError( - f"Distribution for family '{family}' not implemented in " + f"Distribution for family '{dist}' not implemented in " "skpro interface." ) diff --git a/skpro/regression/ondil.py b/skpro/regression/ondil.py index 261f85a75..ecb5ce855 100644 --- a/skpro/regression/ondil.py +++ b/skpro/regression/ondil.py @@ -23,7 +23,7 @@ class OndilOnlineGamlss(BaseProbaRegressor): Parameters ---------- - distribution : str, default="Normal" + dist : str, default="Normal" Name of distribution to expose via skpro. This is used to map parameter names returned by the upstream estimator to skpro's distribution constructors. Common value is "Normal". @@ -48,23 +48,48 @@ class OndilOnlineGamlss(BaseProbaRegressor): "y_inner_mtype": "pd_DataFrame_Table", } - def __init__(self, distribution="Normal", ondil_init_params=None): + # TODO (release 2.14.0) + # remove the 'distribution' argument from '__init__' signature + # remove the following 'if' check and deprecation warning + # de-indent the following 'else' check + + def __init__( + self, distribution="deprecated", dist="Normal", ondil_init_params=None + ): """Initialize OndilOnlineGamlss. Parameters ---------- - distribution : str, default="Normal" + dist : str, default="Normal" Name of distribution to expose via skpro. ondil_init_params : dict, optional Parameters to forward to ondil's OnlineGamlss constructor. """ self.distribution = distribution + self.dist = dist self.ondil_init_params = ondil_init_params # explicit dict of kwargs forwarded to the ondil constructor. 
self._ondil_kwargs = dict(ondil_init_params or {}) super().__init__() + # handle deprecation of distribution -> dist + if distribution != "deprecated": + from warnings import warn + + warn( + "in `OndilOnlineGamlss`, parameter 'distribution' " + "will be renamed to 'dist' in version 2.14.0. " + "To keep current behaviour and to silence this warning, " + "use 'dist' instead of 'distribution', " + "set dist explicitly via kwarg, and do not set distribution.", + category=DeprecationWarning, + stacklevel=2, + ) + self._dist = distribution + else: + self._dist = dist + def _fit(self, X, y): """Fit the underlying ondil OnlineGamlss estimator. @@ -188,7 +213,7 @@ def _predict_proba(self, X): raise TypeError("Unrecognized predict output from ondil: %s" % e) # decide mapping based on requested distribution - dist = self.distribution + dist = self._dist # import skpro distributions lazily distr_mod = importlib.import_module("skpro.distributions") diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index 54e09b4a7..4a3093999 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -75,19 +75,19 @@ class ResidualDouble(BaseProbaRegressor): * ``"squared"`` = squared residuals * if transformer, applies ``fit_transform`` to batch of signed residuals - distr_type : str or BaseDistribution, default = "Normal" + dist : str or BaseDistribution, default = "Normal" type of distribution to predict str options are "Normal", "Laplace", "Cauchy", "t" distr_loc_scale_name : tuple of length two, default = ("loc", "scale") names of the parameters in the distribution to use for location and scale - * if ``distr_type`` is a string, this is overridden to the correct parameters - * if ``distr_type`` is a BaseDistribution, this is used to determine the + * if ``dist`` is a string, this is overridden to the correct parameters + * if ``dist`` is a BaseDistribution, this is used to determine the location and scale parameters that the predictions are passed 
to distr_params : dict, default = {} parameters to pass to the distribution - must be valid parameters of ``distr_type``, if ``BaseDistribution``; + must be valid parameters of ``dist``, if ``BaseDistribution``; must be default or dict with key ``df``, if ``t`` distribution use_y_pred : bool, default=False whether to use the predicted location in predicting the scale of the residual @@ -125,12 +125,18 @@ class ResidualDouble(BaseProbaRegressor): _tags = {"capability:missing": True} + # TODO (release 2.14.0) + # remove the 'distr_type' argument from '__init__' signature + # remove the following 'if' check and deprecation warning + # de-indent the following 'else' check + def __init__( self, estimator, estimator_resid=None, residual_trafo="absolute", - distr_type="Normal", + distr_type="deprecated", + dist="Normal", distr_loc_scale_name=None, distr_params=None, use_y_pred=False, @@ -140,6 +146,7 @@ def __init__( self.estimator = estimator self.estimator_resid = estimator_resid self.residual_trafo = residual_trafo + self.dist = dist self.distr_type = distr_type self.distr_loc_scale_name = distr_loc_scale_name self.distr_params = distr_params @@ -149,6 +156,23 @@ def __init__( super().__init__() + # handle deprecation of distr_type -> dist + if distr_type != "deprecated": + from warnings import warn + + warn( + "in `ResidualDouble`, parameter 'distr_type' " + "will be renamed to 'dist' in version 2.14.0. 
" + "To keep current behaviour and to silence this warning, " + "use 'dist' instead of 'distr_type', " + "set dist explicitly via kwarg, and do not set distr_type.", + category=DeprecationWarning, + stacklevel=2, + ) + self._dist = distr_type + else: + self._dist = dist + self.estimator_ = clone(estimator) if estimator_resid is None: @@ -298,7 +322,7 @@ def _predict_proba(self, X): est = self.estimator_ est_r = self.estimator_resid_ use_y_pred = self.use_y_pred - distr_type = self.distr_type + distr_type = self._dist distr_loc_scale_name = self.distr_loc_scale_name distr_params = self.distr_params min_scale = self.min_scale @@ -395,14 +419,14 @@ def get_test_params(cls, parameter_set="default"): "min_scale": 1e-7, "residual_trafo": "squared", "use_y_pred": True, - "distr_type": "Laplace", + "dist": "Laplace", } params3 = { "estimator": LinearRegression(), "estimator_resid": RandomForestRegressor(), "min_scale": 1e-6, "use_y_pred": True, - "distr_type": "t", + "dist": "t", "distr_params": {"df": 3}, "cv": KFold(n_splits=3), }