# Maps each public distribution method to the name of the class attribute
# ("hook") that holds the formula text for that method's docstring.
_DOC_METHODS = {
    "pdf": "_pdf_formula_doc",
    "cdf": "_cdf_formula_doc",
    "log_pdf": "_log_pdf_formula_doc",
    "pmf": "_pmf_formula_doc",
    "log_pmf": "_log_pmf_formula_doc",
    "ppf": "_ppf_formula_doc",
    "surv": "_surv_formula_doc",
    "haz": "_haz_formula_doc",
    "mean": "_mean_formula_doc",
    "var": "_var_formula_doc",
    "energy": "_energy_formula_doc",
    "pdfnorm": "_pdfnorm_formula_doc",
}


def _inject_formula_doc(base_doc, formula_doc):
    """Fill the ``{formula_doc}`` placeholder of a docstring template.

    Parameters
    ----------
    base_doc : str or None
        Docstring template. Returned unchanged if it is ``None``, empty, or
        does not contain the ``{formula_doc}`` placeholder.
    formula_doc : str or None
        Formula text to insert. It is dedented and stripped before insertion,
        so it can be written as an indented triple-quoted string.
        ``None``, empty, or whitespace-only text means "no formula".

    Returns
    -------
    str or None
        ``base_doc`` with every placeholder resolved:

        * with a formula, the placeholder is replaced by the formula; the
          continuation lines of the formula receive the leading whitespace of
          the placeholder's line, blank formula lines stay empty;
        * without a formula, a line holding only the placeholder is removed
          together with one directly following blank line, whatever its
          indentation; a placeholder embedded in other text is simply deleted.
    """
    placeholder = "{formula_doc}"
    if not base_doc or placeholder not in base_doc:
        return base_doc

    formula = textwrap.dedent(formula_doc).strip() if formula_doc else ""

    out_lines = []
    # True directly after a placeholder-only line has been dropped
    skip_blank = False
    for line in base_doc.split("\n"):
        if skip_blank:
            skip_blank = False
            if not line.strip():
                # also drop the blank separator that followed the placeholder
                continue

        if placeholder not in line:
            out_lines.append(line)
        elif formula:
            # only the leading whitespace is the indent; text in front of an
            # inline placeholder must not be repeated on continuation lines
            indent = line[: len(line) - len(line.lstrip())]
            first, *rest = formula.split("\n")
            rest = [indent + part if part.strip() else "" for part in rest]
            injected = "\n".join([first, *rest])
            out_lines.append(line.replace(placeholder, injected))
        elif line.strip() == placeholder:
            # no formula: remove the whole line instead of leaving whitespace
            skip_blank = True
        else:
            out_lines.append(line.replace(placeholder, ""))

    return "\n".join(out_lines)
@classmethod
def _has_implementation_of(cls, method):
    """Check whether ``method`` is implemented, looking through doc wrappers.

    The verdict of the parent class' ``_has_implementation_of`` is returned
    unchanged, except in one case: the attribute ``method`` found on ``cls``
    carries ``__wrapped__``, and following the ``__wrapped__`` chain to its
    end yields the same object as doing so for the attribute of the same
    name on ``BaseDistribution``. In that case ``False`` is returned.

    Parameters
    ----------
    method : str
        Name of the method to check.

    Returns
    -------
    bool
        ``False`` in the case described above, otherwise the parent verdict.
    """
    parent_verdict = super()._has_implementation_of(method)
    if not parent_verdict:
        return parent_verdict

    candidate = getattr(cls, method, None)
    if not hasattr(candidate, "__wrapped__"):
        # not a wrapper, so there is nothing to look through
        return parent_verdict

    default = getattr(BaseDistribution, method, None)

    # follow the wrapper chain on the subclass side down to its end
    while hasattr(candidate, "__wrapped__"):
        candidate = candidate.__wrapped__

    # same on the base class side, if the base class has such an attribute
    while default is not None and hasattr(default, "__wrapped__"):
        default = default.__wrapped__

    # identical innermost functions: only the wrapper differs from the base
    return False if candidate is default else parent_verdict
Let :math:`X` be a random variables with the distribution of ``self``, @@ -871,6 +1010,10 @@ def _approx_derivative(x, fun, h=1e-7): def pmf(self, x): r"""Probability mass function. + {formula_doc} + + Let :math:`X` be a random variables with the distribution of ``self``, + Let :math:`X` be a random variables with the distribution of ``self``, taking values in ``(N, n)`` ``DataFrame``-s Let :math:`x\in \mathbb{R}^{N\times n}`. @@ -927,6 +1070,10 @@ def _pmf(self, x): def log_pmf(self, x): r"""Logarithmic probability mass function. + {formula_doc} + + Numerically more stable than calling pmf and then taking logarithms. + Numerically more stable than calling pmf and then taking logarithms. Let :math:`X` be a random variables with the distribution of ``self``, @@ -983,6 +1130,10 @@ def _log_pmf(self, x): def cdf(self, x): r"""Cumulative distribution function. + {formula_doc} + + Let :math:`X` be a random variables with the distribution of ``self``, + Let :math:`X` be a random variables with the distribution of ``self``, taking values in ``(N, n)`` ``DataFrame``-s Let :math:`x\in \mathbb{R}^{N\times n}`. @@ -1026,6 +1177,10 @@ def _cdf(self, x): def surv(self, x): r"""Survival function. + {formula_doc} + + Let :math:`X` be a random variables with the distribution of ``self``, + Let :math:`X` be a random variables with the distribution of ``self``, taking values in ``(N, n)`` ``DataFrame``-s Let :math:`x\in \mathbb{R}^{N\times n}`. @@ -1060,6 +1215,9 @@ def _surv(self, x): def haz(self, x): r"""Hazard function. + {formula_doc} + + Let :math:`X` be a random variables with the distribution of ``self``, Let :math:`X` be a random variables with the distribution of ``self``, taking values in ``(N, n)`` ``DataFrame``-s Let :math:`x\in \mathbb{R}^{N\times n}`. @@ -1096,6 +1254,10 @@ def _haz(self, x): def ppf(self, p): r"""Quantile function = percent point function = inverse cdf. 
+ {formula_doc} + + Let :math:`X` be a random variables with the distribution of ``self``, + Let :math:`X` be a random variables with the distribution of ``self``, taking values in ``(N, n)`` ``DataFrame``-s Let :math:`x\in \mathbb{R}^{N\times n}`. @@ -1189,6 +1351,10 @@ def opt_fun(x): def energy(self, x=None): r"""Energy of self, w.r.t. self or a constant frame x. + {formula_doc} + + Let :math:`X, Y` be i.i.d. random variables with the distribution of ``self``. + Let :math:`X, Y` be i.i.d. random variables with the distribution of ``self``. If ``x`` is ``None``, returns :math:`\mathbb{E}[|X-Y|]` (per row), @@ -1358,6 +1524,10 @@ def _sample_mean(self, spl): def mean(self): r"""Return expected value of the distribution. + {formula_doc} + + Let :math:`X` be a random variable with the distribution of ``self``. + Let :math:`X` be a random variable with the distribution of ``self``. Returns the expectation :math:`\mathbb{E}[X]` @@ -1400,6 +1570,10 @@ def _mean(self): def var(self): r"""Return element/entry-wise variance of the distribution. + {formula_doc} + + Let :math:`X` be a random variable with the distribution of ``self``. + Let :math:`X` be a random variable with the distribution of ``self``. Returns :math:`\mathbb{V}[X] = \mathbb{E}\left(X - \mathbb{E}[X]\right)^2`, where the square is element-wise. @@ -1451,6 +1625,10 @@ def _var(self): def pdfnorm(self, a=2): r"""a-norm of pdf, defaults to 2-norm. 
+ {formula_doc} + + computes a-norm of the entry marginal pdf, i.e., + computes a-norm of the entry marginal pdf, i.e., :math:`\mathbb{E}[p_X(X)^{a-1}] = \int p(x)^a dx`, where :math:`X` is a random variable distributed according to the entry marginal diff --git a/skpro/distributions/exponential.py b/skpro/distributions/exponential.py index 1829cb849..3a0ad500d 100644 --- a/skpro/distributions/exponential.py +++ b/skpro/distributions/exponential.py @@ -53,6 +53,55 @@ class Exponential(_ScipyAdapter): "broadcast_init": "on", } + # documentation hooks for formula injection + _pdf_formula_doc = r""" + The probability density function is given by: + + .. math:: + f(x) = \lambda \exp(-\lambda x), \quad x \ge 0 + """ + + _log_pdf_formula_doc = r""" + The log-density is given by: + + .. math:: + \log f(x) = \log(\lambda) - \lambda x, \quad x \ge 0 + """ + + _cdf_formula_doc = r""" + The cumulative distribution function is given by: + + .. math:: + F(x) = 1 - \exp(-\lambda x), \quad x \ge 0 + """ + _ppf_formula_doc = r""" + The quantile function (inverse cdf) is: + + .. math:: + F^{-1}(p; \lambda) = -\frac{\ln(1 - p)}{\lambda} + """ + + _mean_formula_doc = r""" + The expected value is: + + .. math:: + \mathbb{E}[X] = \lambda^{-1} + """ + + _var_formula_doc = r""" + The variance is: + + .. math:: + \text{Var}(X) = \lambda^{-2} + """ + + _energy_formula_doc = r""" + The analytical self-energy is: + + .. math:: + \mathbb{E}[|X - Y|] = \lambda^{-1} + """ + def __init__(self, rate, index=None, columns=None): self.rate = rate diff --git a/skpro/distributions/laplace.py b/skpro/distributions/laplace.py index cefb55eb7..414bd5c9c 100644 --- a/skpro/distributions/laplace.py +++ b/skpro/distributions/laplace.py @@ -54,6 +54,60 @@ class Laplace(BaseDistribution): "broadcast_init": "on", } + # documentation hooks for formula injection + _pdf_formula_doc = r""" + The probability density function is given by: + + .. 
math:: + f(x) = \frac{1}{2b} \exp \left( - \frac{|x - \mu|}{b} \right) + """ + + _log_pdf_formula_doc = r""" + The log-density is given by: + + .. math:: + \log f(x) = - \log(2b) - \frac{|x - \mu|}{b} + """ + + _cdf_formula_doc = r""" + The cumulative distribution function is given by: + + .. math:: + F(x) = + \begin{cases} + \frac{1}{2} \exp \left( \frac{x - \mu}{b} \right), & x < \mu \\ + 1 - \frac{1}{2} \exp \left( - \frac{x - \mu}{b} \right), & x \geq \mu + \end{cases} + """ + + _ppf_formula_doc = r""" + The quantile function (inverse cdf) is: + + .. math:: + F^{-1}(p; \mu, b) = \mu - b \operatorname{sgn}(p - 0.5) \ln(1 - 2|p - 0.5|) + """ + + _mean_formula_doc = r""" + The expected value is: + + .. math:: + \mathbb{E}[X] = \mu + """ + + _var_formula_doc = r""" + The variance is: + + .. math:: + \text{Var}(X) = 2b^2 + """ + + _energy_formula_doc = r""" + The analytical self-energy is: + + .. math:: + \mathbb{E}[|X - Y|] = \frac{3}{2}b + """ + def __init__(self, mu, scale, index=None, columns=None): self.mu = mu self.scale = scale diff --git a/skpro/distributions/normal.py b/skpro/distributions/normal.py index 519fffac2..ec16d8438 100644 --- a/skpro/distributions/normal.py +++ b/skpro/distributions/normal.py @@ -51,6 +51,58 @@ class Normal(BaseDistribution): "broadcast_init": "on", } + # documentation hooks for formula injection + _pdf_formula_doc = r""" + The probability density function is given by: + + .. math:: + f(x) = \frac{1}{\sigma \sqrt{2\pi}} \exp\left(-\frac{(x - \mu)^2} + {2\sigma^2}\right) + """ + + _log_pdf_formula_doc = r""" + The log-density is given by: + + .. math:: + \log f(x) = -\frac{(x - \mu)^2}{2\sigma^2} - \log(\sigma \sqrt{2\pi}) + """ + + _cdf_formula_doc = r""" + The cumulative distribution function is given by: + + .. math:: + F(x) = \frac{1}{2} \left[1 + \operatorname{erf}\left(\frac{x - \mu} + {\sigma\sqrt{2}}\right)\right] + """ + + _ppf_formula_doc = r""" + The quantile function (inverse cdf) is: + + .. 
math:: + F^{-1}(p; \mu, \sigma) = \mu + \sigma \sqrt{2} \operatorname{erf}^{-1}(2p - 1) + """ + + _mean_formula_doc = r""" + The expected value is: + + .. math:: + \mathbb{E}[X] = \mu + """ + + _var_formula_doc = r""" + The variance is: + + .. math:: + \text{Var}(X) = \sigma^2 + """ + + _energy_formula_doc = r""" + The analytical self-energy is: + + .. math:: + \mathbb{E}[|X - Y|] = \frac{2\sigma}{\sqrt{\pi}} + """ + def __init__(self, mu, sigma, index=None, columns=None): self.mu = mu self.sigma = sigma diff --git a/skpro/distributions/rayleigh.py b/skpro/distributions/rayleigh.py index 1503530e2..d557ec835 100644 --- a/skpro/distributions/rayleigh.py +++ b/skpro/distributions/rayleigh.py @@ -54,6 +54,57 @@ class Rayleigh(BaseDistribution): "broadcast_init": "on", } + # documentation hooks for formula injection + _pdf_formula_doc = r""" + The probability density function is given by: + + .. math:: + f(x; \sigma) = \frac{x}{\sigma^2} \exp\left(-\frac{x^2}{2\sigma^2}\right), + \quad x \geq 0 + """ + + _cdf_formula_doc = r""" + The cumulative distribution function is given by: + + .. math:: + F(x; \sigma) = 1 - \exp\left(-\frac{x^2}{2\sigma^2}\right), \quad x \geq 0 + """ + + _log_pdf_formula_doc = r""" + The log-density is given by: + + .. math:: + \log f(x) = \log(x) - 2\log(\sigma) - \frac{x^2}{2\sigma^2}, \quad x > 0 + """ + + _ppf_formula_doc = r""" + The quantile function (inverse cdf) is: + + .. math:: + F^{-1}(p; \sigma) = \sigma \sqrt{-2 \ln(1 - p)} + """ + + _mean_formula_doc = r""" + The expected value is: + + .. math:: + \mathbb{E}[X] = \sigma \sqrt{\frac{\pi}{2}} + """ + + _var_formula_doc = r""" + The variance is: + + .. math:: + \text{Var}(X) = \frac{4 - \pi}{2} \sigma^2 + """ + + _energy_formula_doc = r""" + The analytical self-energy is: + + .. 
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)
"""Tests for the injection of formula text into distribution docstrings."""

import pytest

from skpro.distributions.exponential import Exponential
from skpro.distributions.laplace import Laplace
from skpro.distributions.normal import Normal
from skpro.distributions.rayleigh import Rayleigh
from skpro.distributions.weibull import Weibull

# public methods whose docstrings are checked by these tests
TARGET_METHODS = [
    "pdf",
    "log_pdf",
    "cdf",
    "pmf",
    "log_pmf",
    "ppf",
    "surv",
    "haz",
    "energy",
    "mean",
    "var",
    "pdfnorm",
]

# distributions that define formula hooks
HOOKED_CLASSES = [Normal, Rayleigh, Exponential, Laplace]

# control group: distributions without formula hooks
UNHOOKED_CLASSES = [Weibull]


@pytest.mark.parametrize("dist_cls", HOOKED_CLASSES + UNHOOKED_CLASSES)
@pytest.mark.parametrize("method_name", TARGET_METHODS)
def test_placeholder_removal_universal(dist_cls, method_name):
    """Check that no docstring still shows the raw ``{formula_doc}`` marker."""
    method = getattr(dist_cls, method_name)
    doc = method.__doc__
    assert "{formula_doc}" not in doc, f"Leak in {dist_cls.__name__}.{method_name}"


@pytest.mark.parametrize("dist_cls", HOOKED_CLASSES)
def test_hooked_math_injection(dist_cls):
    """Check that the ``pdf`` docstring of hooked classes contains the formula."""
    doc = dist_cls.pdf.__doc__

    assert ".. math::" in doc, f"Math missing in {dist_cls.__name__}.pdf"
    assert "f(x" in doc, f"Formula content missing in {dist_cls.__name__}.pdf"


@pytest.mark.parametrize("dist_cls", UNHOOKED_CLASSES)
@pytest.mark.parametrize("method_name", ["pdf", "mean", "energy"])
def test_unhooked_clean_fallback(dist_cls, method_name):
    """Check that classes without hooks keep the generic docstring."""
    doc = getattr(dist_cls, method_name).__doc__
    # no math block may appear when no hook is defined
    assert (
        ".. math::" not in doc
    ), f"Ghost math block in {dist_cls.__name__}.{method_name}"
    # the generic preamble must still be present
    assert "with the distribution of" in doc


def test_wrapper_execution_safety():
    """Check that wrapped methods still execute on a Rayleigh test instance.

    No ``try``/``except`` is used here: an exception raised by any call
    already fails the test, and pytest then reports the original traceback
    instead of a flattened message.
    """
    dist = Rayleigh.create_test_instance()
    dist.pdf(1.0)
    dist.mean()
    dist.energy()


def test_metadata_integrity_full_check():
    """Check ``__name__`` and ``__module__`` of all target methods on Rayleigh."""
    for method_name in TARGET_METHODS:
        method = getattr(Rayleigh, method_name)
        assert (
            method.__name__ == method_name
        ), f"Wrong __name__ on Rayleigh.{method_name}: {method.__name__}"
        # the wrapped public methods are expected to report the base module
        assert (
            "skpro.distributions.base" in method.__module__
        ), f"Wrong __module__ on Rayleigh.{method_name}: {method.__module__}"