diff --git a/skpro/regression/cyclic_boosting.py b/skpro/regression/cyclic_boosting.py
index e536f6e47..1a7a07d08 100644
--- a/skpro/regression/cyclic_boosting.py
+++ b/skpro/regression/cyclic_boosting.py
@@ -83,11 +83,10 @@ class CyclicBoosting(BaseProbaRegressor):
         be on a bounded interval, with support between ``lower`` and ``upper``.
     maximal_iterations : int, default=10
         maximum number of iterations for the cyclic boosting algorithm
-    dist_type: str, one of ``'normal'`` (default), ``'logistic'``
-        inner base distribution to use for the Johnson QPD, i.e., before
-        arcosh and similar transformations.
-        Available options are ``'normal'`` (default), ``'logistic'``,
-        or ``'sinhlogistic'``.
+    dist : str, default='normal'
+        Inner base distribution to use for the Johnson QPD, i.e., before
+        arcosh and similar transformations. Available options are
+        ``'normal'`` (default), ``'logistic'``, or ``'sinhlogistic'``.
 
     Attributes
     ----------
@@ -133,6 +132,11 @@ class CyclicBoosting(BaseProbaRegressor):
         "tests:vm": True,  # requires its own test VM to run
     }
 
+    # TODO (release 2.14.0)
+    # remove the 'dist_type' argument from '__init__' signature
+    # remove the following 'if' check and deprecation warning
+    # de-indent the following 'else' check
+
     def __init__(
         self,
         feature_groups: Union[List[str], List[Tuple[str, ...]], None] = None,
@@ -142,7 +146,8 @@ def __init__(
         lower: Union[float, None] = None,
         upper: Union[float, None] = None,
         maximal_iterations=10,
-        dist_type: Union[str, None] = "normal",
+        dist_type: Union[str, None] = "deprecated",
+        dist: Union[str, None] = "normal",
         dist_shape: Union[float, None] = 0.0,
     ):
         self.feature_groups = feature_groups
@@ -153,10 +158,28 @@ def __init__(
         self.upper = upper
         self.maximal_iterations = maximal_iterations
         self.dist_type = dist_type
+        self.dist = dist
         self.dist_shape = dist_shape
 
         super().__init__()
 
+        # handle deprecation of dist_type -> dist
+        if dist_type != "deprecated":
+            from warnings import warn
+
+            warn(
+                "in `CyclicBoosting`, parameter 'dist_type' "
+                "will be renamed to 'dist' in version 2.14.0. "
+                "To keep current behaviour and to silence this warning, "
+                "use 'dist' instead of 'dist_type', "
+                "set dist explicitly via kwarg, and do not set dist_type.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            self._dist = dist_type
+        else:
+            self._dist = dist
+
         self.quantiles = [self.alpha, 0.5, 1 - self.alpha]
         self.quantile_values = list()
         self.quantile_est = list()
@@ -315,7 +338,7 @@ def _predict_proba(self, X):
             "qv_high": self.quantile_values[2].reshape(-1, 1),
             "lower": self.lower,
             "upper": self.upper,
-            "base_dist": self.dist_type,
+            "base_dist": self._dist,
             "index": index,
             "columns": y_cols,
         }
diff --git a/skpro/regression/gam/_gam.py b/skpro/regression/gam/_gam.py
index 21a80003d..a4ec1a1eb 100644
--- a/skpro/regression/gam/_gam.py
+++ b/skpro/regression/gam/_gam.py
@@ -30,7 +30,7 @@ class GAMRegressor(BaseProbaRegressor):
         By default a univariate spline term will be allocated for each feature.
         Can be a ``pygam`` terms expression for custom model specification.
 
-    distribution : str or pygam.Distribution, optional (default='Normal')
+    dist : str or pygam.Distribution, optional (default='normal')
         Distribution family to use in the model.
         Supported strings (case-insensitive):
 
@@ -81,17 +81,17 @@ class GAMRegressor(BaseProbaRegressor):
     >>> y_positive = y.abs() + 1  # ensure positive targets for Poisson/Gamma
     >>>
     >>> # Normal distribution (default)
-    >>> gam_normal = GAMRegressor(distribution='Normal')
+    >>> gam_normal = GAMRegressor(dist='Normal')
     >>> gam_normal.fit(X, y)
     GAMRegressor(...)
     >>>
     >>> # Poisson distribution for count data
-    >>> gam_poisson = GAMRegressor(distribution='Poisson', link='log')
+    >>> gam_poisson = GAMRegressor(dist='Poisson', link='log')
     >>> gam_poisson.fit(X, y_positive)
     GAMRegressor(...)
     >>>
     >>> # Gamma distribution for positive continuous data
-    >>> gam_gamma = GAMRegressor(distribution='Gamma', link='log')
+    >>> gam_gamma = GAMRegressor(dist='Gamma', link='log')
     >>> gam_gamma.fit(X, y_positive)
     GAMRegressor(...)
     """
@@ -109,10 +109,16 @@ class GAMRegressor(BaseProbaRegressor):
         "tests:vm": True,
     }
 
+    # TODO (release 2.14.0)
+    # remove the 'distribution' argument from '__init__' signature
+    # remove the following 'if' check and deprecation warning
+    # de-indent the following 'else' check
+
     def __init__(
         self,
         terms="auto",
-        distribution="normal",
+        distribution="deprecated",
+        dist="normal",
         link="identity",
         max_iter=100,
         tol=1e-4,
@@ -122,6 +128,7 @@ def __init__(
     ):
         self.terms = terms
         self.distribution = distribution
+        self.dist = dist
         self.link = link
         self.max_iter = max_iter
         self.tol = tol
@@ -131,6 +138,23 @@ def __init__(
 
         super().__init__()
 
+        # handle deprecation of distribution -> dist
+        if distribution != "deprecated":
+            from warnings import warn
+
+            warn(
+                "in `GAMRegressor`, parameter 'distribution' "
+                "will be renamed to 'dist' in version 2.14.0. "
+                "To keep current behaviour and to silence this warning, "
+                "use 'dist' instead of 'distribution', "
+                "set dist explicitly via kwarg, and do not set distribution.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            self._dist = distribution
+        else:
+            self._dist = dist
+
     def _fit(self, X, y):
         """Fit regressor to training data.
 
@@ -158,7 +182,7 @@ def _fit(self, X, y):
         if callbacks is None:
             callbacks = ["deviance", "diffs"]
 
-        dist_name = self._get_distribution_name(self.distribution)
+        dist_name = self._get_distribution_name(self._dist)
 
         # Map common names to skpro distribution names
         dist_map = {
diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py
index bb9e65f87..aeb522423 100644
--- a/skpro/regression/linear/_glm.py
+++ b/skpro/regression/linear/_glm.py
@@ -22,10 +22,10 @@ class GLMRegressor(BaseProbaRegressor):
 
     Parameters
     ----------
-    family : string, default : "Normal"
-        The family parameter denotes the type of distribution
+    dist : string, default : "Normal"
+        The dist parameter denotes the type of distribution
         that will be used.
-        Available family/distributions are
+        Available distributions are
         1."Normal"
         2."Poisson"
         3."Gamma"
@@ -205,7 +205,7 @@ class GLMRegressor(BaseProbaRegressor):
         "y_inner_mtype": "pd_DataFrame_Table",
     }
 
-    def _str_to_sm_family(self, family, link):
+    def _str_to_sm_family(self, dist, link):
         """Convert the string to a statsmodel object.
 
         If the link function is also explicitly mentioned then include then
@@ -232,16 +232,22 @@ def _str_to_sm_family(self, family, link):
         if link in links:
             link_function = links[link]()
             try:
-                return sm_fmly[family](link_function)
+                return sm_fmly[dist](link_function)
             except Exception:
-                msg = "Invalid link for family, default link will be used"
+                msg = "Invalid link for distribution, default link will be used"
                 warn(msg)
 
-        return sm_fmly[family]()
+        return sm_fmly[dist]()
+
+    # TODO (release 2.14.0)
+    # remove the 'family' argument from '__init__' signature
+    # remove the following 'if' check and deprecation warning
+    # de-indent the following 'else' check
 
     def __init__(
         self,
-        family="Normal",
+        family="deprecated",
+        dist="Normal",
         link=None,
         offset_var=None,
         exposure_var=None,
@@ -262,6 +268,7 @@ def __init__(
         super().__init__()
 
         self.family = family
+        self.dist = dist
         self.link = link
         self.offset_var = offset_var
         self.exposure_var = exposure_var
@@ -279,7 +286,23 @@ def __init__(
         self.max_start_irls = max_start_irls
         self.add_constant = add_constant
 
-        self._family = self.family
+        # handle deprecation of family -> dist
+        if family != "deprecated":
+            from warnings import warn
+
+            warn(
+                "in `GLMRegressor`, parameter 'family' "
+                "will be renamed to 'dist' in version 2.14.0. "
+                "To keep current behaviour and to silence this warning, "
+                "use 'dist' instead of 'family', "
+                "set dist explicitly via kwarg, and do not set family.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            self._dist = family
+        else:
+            self._dist = dist
+
         self._link = self.link
         self._offset_var = self.offset_var
         self._exposure_var = self.exposure_var
@@ -325,12 +348,12 @@ def _fit(self, X, y):
 
         # remove the offset and exposure columns which
         # was inserted to maintain the shape
-        family = self._family
+        dist = self._dist
         link = self._link
 
         # ensure numerical stability for Gamma by injecting an intercept
         self._auto_added_constant = False
-        if family == "Gamma" and not self._add_constant:
+        if dist == "Gamma" and not self._add_constant:
             self._add_constant = True
             self._auto_added_constant = True
 
@@ -341,7 +364,7 @@ def _fit(self, X, y):
 
         y_col = y.columns
 
-        sm_family = self._str_to_sm_family(family=family, link=link)
+        sm_family = self._str_to_sm_family(dist=dist, link=link)
 
         glm_estimator = GLM(
             endog=y,
@@ -434,7 +457,7 @@ def _predict(self, X):
 
         return y_pred
 
-    def _params_sm_to_skpro(self, y_predictions_df, index, columns, family):
+    def _params_sm_to_skpro(self, y_predictions_df, index, columns, dist):
         """Convert the statsmodels output to equivalent skpro distribution."""
         from skpro.distributions.gamma import Gamma
         from skpro.distributions.normal import Normal
@@ -449,8 +472,8 @@ def _params_sm_to_skpro(self, y_predictions_df, index, columns, family):
         params = {}
         skp_dist = Normal
 
-        if family in skpro_distr:
-            skp_dist = skpro_distr[family]
+        if dist in skpro_distr:
+            skp_dist = skpro_distr[dist]
 
         if skp_dist == Normal:
             y_mu = y_predictions_df["mean"].rename("mu").to_frame()
@@ -512,11 +535,11 @@ def _predict_proba(self, X):
         y_predictions_df = self.glm_fit_.get_prediction(X_).summary_frame()
 
         # convert the returned values to skpro equivalent distribution
-        family = self._family
+        dist = self._dist
         index = X_.index
         columns = y_column
 
-        y_pred = self._params_sm_to_skpro(y_predictions_df, index, columns, family)
+        y_pred = self._params_sm_to_skpro(y_predictions_df, index, columns, dist)
         return y_pred
 
     def _prep_x(self, X, offset_var, exposure_var, rtn_off_exp_arr):
diff --git a/skpro/regression/linear/_glum.py b/skpro/regression/linear/_glum.py
index 77be264aa..f1a625923 100644
--- a/skpro/regression/linear/_glum.py
+++ b/skpro/regression/linear/_glum.py
@@ -24,14 +24,14 @@ class GlumRegressor(BaseProbaRegressor):
 
     Parameters
     ----------
-    family : str or ExponentialDispersionModel, default='normal'
+    dist : str or ExponentialDispersionModel, default='normal'
         The distributional assumption of the GLM.
         One of: 'binomial', 'gamma', 'gaussian', 'inverse.gaussian',
         'normal', 'poisson', 'tweedie', 'negative.binomial'.
     link : str or Link, default='auto'
         The link function of the GLM.
-        If 'auto', the canonical link for the family is used.
-        Supported links depend on the family. Common options include:
+        If 'auto', the canonical link for the distribution is used.
+        Supported links depend on the distribution. Common options include:
         'identity', 'log', 'logit', 'probit', 'cloglog', 'pow', 'nbinom'.
     alpha : float or array-like, default=None
         Constant that multiplies the penalty terms.
@@ -131,9 +131,15 @@ class GlumRegressor(BaseProbaRegressor):
         "tests:vm": True,
     }
 
+    # TODO (release 2.14.0)
+    # remove the 'family' argument from '__init__' signature
+    # remove the following 'if' check and deprecation warning
+    # de-indent the following 'else' check
+
     def __init__(
         self,
-        family="normal",
+        family="deprecated",
+        dist="normal",
         link="auto",
         alpha=None,
         l1_ratio=0,
@@ -163,6 +169,7 @@ def __init__(
         expected_information=False,
     ):
         self.family = family
+        self.dist = dist
         self.link = link
         self.alpha = alpha
         self.l1_ratio = l1_ratio
@@ -193,14 +200,31 @@ def __init__(
 
         super().__init__()
 
+        # handle deprecation of family -> dist
+        if family != "deprecated":
+            from warnings import warn
+
+            warn(
+                "in `GlumRegressor`, parameter 'family' "
+                "will be renamed to 'dist' in version 2.14.0. "
+                "To keep current behaviour and to silence this warning, "
+                "use 'dist' instead of 'family', "
+                "set dist explicitly via kwarg, and do not set family.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            self._dist = family
+        else:
+            self._dist = dist
+
     @classmethod
     def get_test_params(cls, parameter_set="default"):
         """Return testing parameter settings for the estimator."""
-        params1 = {"family": "normal"}
-        params2 = {"family": "gamma", "link": "log"}
-        params3 = {"family": "poisson"}
-        params4 = {"family": "negative.binomial"}
-        params5 = {"family": "normal", "alpha": 0.1, "l1_ratio": 0.5}
+        params1 = {"dist": "normal"}
+        params2 = {"dist": "gamma", "link": "log"}
+        params3 = {"dist": "poisson"}
+        params4 = {"dist": "negative.binomial"}
+        params5 = {"dist": "normal", "alpha": 0.1, "l1_ratio": 0.5}
         return [params1, params2, params3, params4, params5]
 
     def _fit(self, X, y):
@@ -221,7 +245,7 @@ def _fit(self, X, y):
         from glum import GeneralizedLinearRegressor
 
         self.estimator_ = GeneralizedLinearRegressor(
-            family=self.family,
+            family=self._dist,
             link=self.link,
             alpha=self.alpha,
             l1_ratio=self.l1_ratio,
@@ -312,30 +336,30 @@ def _predict_proba(self, X):
             The predicted distribution.
         """
         mu = self._predict(X)
-        family = self.family
+        dist = self._dist
 
-        if isinstance(family, str):
-            family_str = family.lower()
+        if isinstance(dist, str):
+            dist_str = dist.lower()
         else:
-            # If family is an object, we need to infer the type
+            # If dist is an object, we need to infer the type
             # This is tricky, but let's assume string for now as per init
-            family_str = str(family).lower()
+            dist_str = str(dist).lower()
 
-        if "normal" in family_str or "gaussian" in family_str:
+        if "normal" in dist_str or "gaussian" in dist_str:
             # Normal distribution
             # Variance = dispersion * v(mu) = dispersion * 1 = dispersion
             # So sigma = sqrt(dispersion)
             sigma = np.sqrt(self.dispersion_)
             return Normal(mu=mu, sigma=sigma, index=X.index, columns=self._y_cols)
 
-        elif "poisson" in family_str:
+        elif "poisson" in dist_str:
             # Poisson distribution
             # skpro Poisson takes mu.
             # If dispersion != 1, it's not standard Poisson.
             # But skpro Poisson is standard.
             return Poisson(mu=mu, index=X.index, columns=self._y_cols)
 
-        elif "gamma" in family_str:
+        elif "gamma" in dist_str:
             # Gamma distribution
             # mu = alpha / beta
             # var = alpha / beta^2 = dispersion * mu^2
@@ -345,20 +369,20 @@ def _predict_proba(self, X):
             beta = 1.0 / (self.dispersion_ * mu)
             return Gamma(alpha=alpha, beta=beta, index=X.index, columns=self._y_cols)
 
-        elif "negative.binomial" in family_str:
+        elif "negative.binomial" in dist_str:
             # Negative Binomial
             # var = mu + theta * mu^2
             # skpro NB takes mu and alpha (where var = mu + mu^2/alpha)
             # So alpha_skpro = 1/theta_glum
 
-            # We need to extract theta from family string or object
-            # If family is string like 'negative.binomial(1.5)', theta is 1.5
-            # If family is 'negative.binomial', theta is default 1.0?
+            # We need to extract theta from dist string or object
+            # If dist is string like 'negative.binomial(1.5)', theta is 1.5
+            # If dist is 'negative.binomial', theta is default 1.0?
 
             theta = 1.0
-            if "(" in family_str:
+            if "(" in dist_str:
                 try:
-                    theta = float(family_str.split("(")[1].split(")")[0])
+                    theta = float(dist_str.split("(")[1].split(")")[0])
                 except ValueError:
                     pass
 
@@ -376,6 +400,6 @@ def _predict_proba(self, X):
 
         else:
             raise NotImplementedError(
-                f"Distribution for family '{family}' not implemented in "
+                f"Distribution for family '{dist}' not implemented in "
                 "skpro interface."
             )
diff --git a/skpro/regression/ondil.py b/skpro/regression/ondil.py
index 261f85a75..ecb5ce855 100644
--- a/skpro/regression/ondil.py
+++ b/skpro/regression/ondil.py
@@ -23,7 +23,7 @@ class OndilOnlineGamlss(BaseProbaRegressor):
 
     Parameters
     ----------
-    distribution : str, default="Normal"
+    dist : str, default="Normal"
         Name of distribution to expose via skpro. This is used to map
         parameter names returned by the upstream estimator to skpro's
         distribution constructors. Common value is "Normal".
@@ -48,23 +48,48 @@ class OndilOnlineGamlss(BaseProbaRegressor):
         "y_inner_mtype": "pd_DataFrame_Table",
     }
 
-    def __init__(self, distribution="Normal", ondil_init_params=None):
+    # TODO (release 2.14.0)
+    # remove the 'distribution' argument from '__init__' signature
+    # remove the following 'if' check and deprecation warning
+    # de-indent the following 'else' check
+
+    def __init__(
+        self, distribution="deprecated", dist="Normal", ondil_init_params=None
+    ):
         """Initialize OndilOnlineGamlss.
 
         Parameters
         ----------
-        distribution : str, default="Normal"
+        dist : str, default="Normal"
             Name of distribution to expose via skpro.
         ondil_init_params : dict, optional
             Parameters to forward to ondil's OnlineGamlss constructor.
         """
         self.distribution = distribution
+        self.dist = dist
         self.ondil_init_params = ondil_init_params
         # explicit dict of kwargs forwarded to the ondil constructor.
         self._ondil_kwargs = dict(ondil_init_params or {})
 
         super().__init__()
 
+        # handle deprecation of distribution -> dist
+        if distribution != "deprecated":
+            from warnings import warn
+
+            warn(
+                "in `OndilOnlineGamlss`, parameter 'distribution' "
+                "will be renamed to 'dist' in version 2.14.0. "
+                "To keep current behaviour and to silence this warning, "
+                "use 'dist' instead of 'distribution', "
+                "set dist explicitly via kwarg, and do not set distribution.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            self._dist = distribution
+        else:
+            self._dist = dist
+
     def _fit(self, X, y):
         """Fit the underlying ondil OnlineGamlss estimator.
 
@@ -188,7 +213,7 @@ def _predict_proba(self, X):
                 raise TypeError("Unrecognized predict output from ondil: %s" % e)
 
         # decide mapping based on requested distribution
-        dist = self.distribution
+        dist = self._dist
         # import skpro distributions lazily
         distr_mod = importlib.import_module("skpro.distributions")
 
diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py
index 54e09b4a7..4a3093999 100644
--- a/skpro/regression/residual.py
+++ b/skpro/regression/residual.py
@@ -75,19 +75,19 @@ class ResidualDouble(BaseProbaRegressor):
         * ``"squared"`` = squared residuals
         * if transformer, applies ``fit_transform`` to batch of signed residuals
 
-    distr_type : str or BaseDistribution, default = "Normal"
+    dist : str or BaseDistribution, default = "Normal"
         type of distribution to predict
         str options are "Normal", "Laplace", "Cauchy", "t"
     distr_loc_scale_name : tuple of length two, default = ("loc", "scale")
         names of the parameters in the distribution to use for location and scale
 
-        * if ``distr_type`` is a string, this is overridden to the correct parameters
-        * if ``distr_type`` is a BaseDistribution, this is used to determine the
+        * if ``dist`` is a string, this is overridden to the correct parameters
+        * if ``dist`` is a BaseDistribution, this is used to determine the
           location and scale parameters that the predictions are passed to
 
     distr_params : dict, default = {}
         parameters to pass to the distribution
-        must be valid parameters of ``distr_type``, if ``BaseDistribution``;
+        must be valid parameters of ``dist``, if ``BaseDistribution``;
         must be default or dict with key ``df``, if ``t`` distribution
     use_y_pred : bool, default=False
         whether to use the predicted location in predicting the scale of the residual
@@ -125,12 +125,18 @@ class ResidualDouble(BaseProbaRegressor):
 
     _tags = {"capability:missing": True}
 
+    # TODO (release 2.14.0)
+    # remove the 'distr_type' argument from '__init__' signature
+    # remove the following 'if' check and deprecation warning
+    # de-indent the following 'else' check
+
     def __init__(
         self,
         estimator,
         estimator_resid=None,
         residual_trafo="absolute",
-        distr_type="Normal",
+        distr_type="deprecated",
+        dist="Normal",
         distr_loc_scale_name=None,
         distr_params=None,
         use_y_pred=False,
@@ -140,6 +146,7 @@ def __init__(
         self.estimator = estimator
         self.estimator_resid = estimator_resid
         self.residual_trafo = residual_trafo
+        self.dist = dist
         self.distr_type = distr_type
         self.distr_loc_scale_name = distr_loc_scale_name
         self.distr_params = distr_params
@@ -149,6 +156,23 @@ def __init__(
 
         super().__init__()
 
+        # handle deprecation of distr_type -> dist
+        if distr_type != "deprecated":
+            from warnings import warn
+
+            warn(
+                "in `ResidualDouble`, parameter 'distr_type' "
+                "will be renamed to 'dist' in version 2.14.0. "
+                "To keep current behaviour and to silence this warning, "
+                "use 'dist' instead of 'distr_type', "
+                "set dist explicitly via kwarg, and do not set distr_type.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+            self._dist = distr_type
+        else:
+            self._dist = dist
+
         self.estimator_ = clone(estimator)
 
         if estimator_resid is None:
@@ -298,7 +322,7 @@ def _predict_proba(self, X):
         est = self.estimator_
         est_r = self.estimator_resid_
         use_y_pred = self.use_y_pred
-        distr_type = self.distr_type
+        distr_type = self._dist
         distr_loc_scale_name = self.distr_loc_scale_name
         distr_params = self.distr_params
         min_scale = self.min_scale
@@ -395,14 +419,14 @@ def get_test_params(cls, parameter_set="default"):
             "min_scale": 1e-7,
             "residual_trafo": "squared",
             "use_y_pred": True,
-            "distr_type": "Laplace",
+            "dist": "Laplace",
         }
         params3 = {
             "estimator": LinearRegression(),
             "estimator_resid": RandomForestRegressor(),
             "min_scale": 1e-6,
             "use_y_pred": True,
-            "distr_type": "t",
+            "dist": "t",
             "distr_params": {"df": 3},
             "cv": KFold(n_splits=3),
         }