Skip to content

Commit d5d618c

Browse files
Refactor compute_bootci to use scipy.stats.bootstrap (#505)
* Update minimum scipy version to 1.10 * Use scipy.bootstrap in compute_bootci
1 parent f08cc88 commit d5d618c

File tree

4 files changed

+79
-143
lines changed

4 files changed

+79
-143
lines changed

.github/workflows/pytest.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,15 @@ jobs:
4949
deps:
5050
# Minimal supported versions and increasing gradually
5151
- python-version: "3.10"
52-
scipy: "scipy==1.8.0"
52+
scipy: "scipy==1.10.0"
5353
numpy: "numpy==1.22.4"
5454
pandas: "pandas==2.1.1"
5555
statsmodels: "statsmodels==0.14.1"
5656
scikit-learn: "scikit-learn==1.2.2"
5757
label: "really-old-versions"
5858

5959
- python-version: "3.11"
60-
scipy: "scipy==1.9.3"
60+
scipy: "scipy==1.11.0"
6161
numpy: "numpy==1.25.2"
6262
pandas: "pandas==2.2.0"
6363
statsmodels: "statsmodels==0.14.3"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ dependencies = [
3333
"pandas>=2.1.1",
3434
"pandas_flavor",
3535
"scikit-learn>=1.2.2",
36-
"scipy>=1.8.0",
36+
"scipy>=1.10.0",
3737
"seaborn",
3838
"statsmodels>=0.14.1",
3939
"tabulate",

src/pingouin/effsize.py

Lines changed: 69 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ def compute_bootci(
195195
x,
196196
y=None,
197197
func=None,
198-
method="cper",
198+
method="bca",
199199
paired=False,
200200
confidence=0.95,
201201
n_boot=2000,
@@ -224,9 +224,10 @@ def compute_bootci(
224224
method : str
225225
Method to compute the confidence intervals (see Notes):
226226
227-
* ``'cper'``: Bias-corrected percentile method (default)
228-
* ``'norm'``: Normal approximation with bootstrapped bias and standard error
227+
* ``'bca'``: Bias-corrected and accelerated (BCa, default)
229228
* ``'per'``: Simple percentile
229+
* ``'basic'``: Basic (pivotal) method
230+
* ``'norm'``: Normal approximation with bootstrapped bias and standard error
230231
paired : boolean
231232
Indicates whether x and y are paired or not. For example, for correlation functions or
232233
paired T-test, x and y are assumed to be paired. Pingouin will resample the pairs
@@ -251,24 +252,18 @@ def compute_bootci(
251252
252253
Notes
253254
-----
254-
Results have been tested against the
255-
`bootci <https://www.mathworks.com/help/stats/bootci.html>`_ Matlab function.
255+
This function uses :py:func:`scipy.stats.bootstrap` under the hood. Requires SciPy >= 1.10.
256256
257-
Since version 1.7, SciPy also includes a built-in bootstrap function
258-
:py:func:`scipy.stats.bootstrap`. The SciPy implementation has two advantages over Pingouin: it
259-
is faster when using ``vectorized=True``, and it supports the bias-corrected and accelerated
260-
(BCa) confidence intervals for univariate functions. However, unlike Pingouin, it does not
261-
return the bootstrap distribution.
257+
The bias-corrected and accelerated method (``bca``, default) corrects for both bias and
258+
skewness of the bootstrap distribution using jackknife resampling.
262259
263260
The percentile bootstrap method (``per``) is defined as the
264261
:math:`100 \\times \\frac{\\alpha}{2}` and :math:`100 \\times \\frac{1 - \\alpha}{2}`
265262
percentiles of the distribution of :math:`\\theta` estimates obtained from resampling, where
266263
:math:`\\alpha` is the level of significance (1 - confidence, default = 0.05 for 95% CIs).
267264
268-
The bias-corrected percentile method (``cper``) corrects for bias of the bootstrap
269-
distribution. This method is different from the BCa method — the default in Matlab and SciPy —
270-
which corrects for both bias and skewness of the bootstrap distribution using jackknife
271-
resampling.
265+
The basic (pivotal) method (``basic``) reflects the bootstrap distribution around the
266+
observed statistic.
272267
273268
The normal approximation method (``norm``) calculates the confidence intervals with the
274269
standard normal distribution using bootstrapped bias and standard error.
@@ -286,7 +281,7 @@ def compute_bootci(
286281
287282
Examples
288283
--------
289-
1. Bootstrapped 95% confidence interval of a Pearson correlation
284+
1. Bootstrapped 95% BCa confidence interval of a Pearson correlation
290285
291286
>>> import pingouin as pg
292287
>>> import numpy as np
@@ -296,60 +291,36 @@ def compute_bootci(
296291
>>> stat = np.corrcoef(x, y)[0][1]
297292
>>> ci = pg.compute_bootci(x, y, func="pearson", paired=True, seed=42, decimals=4)
298293
>>> print(round(stat, 4), ci)
299-
0.0945 [-0.098 0.2738]
300-
301-
Let's compare to SciPy's built-in bootstrap function
302-
303-
>>> from scipy.stats import bootstrap
304-
>>> bt_scipy = bootstrap(
305-
... data=(x, y),
306-
... statistic=lambda x, y: np.corrcoef(x, y)[0][1],
307-
... method="basic",
308-
... vectorized=False,
309-
... n_resamples=2000,
310-
... paired=True,
311-
... random_state=42,
312-
... )
313-
>>> np.round(bt_scipy.confidence_interval, 4)
314-
array([-0.0952, 0.2883])
294+
0.0945 [-0.0986 0.2868]
315295
316-
2. Bootstrapped 95% confidence interval of a Cohen d
296+
2. Bootstrapped 95% BCa confidence interval of a Cohen d
317297
318298
>>> stat = pg.compute_effsize(x, y, eftype="cohen")
319299
>>> ci = pg.compute_bootci(x, y, func="cohen", seed=42, decimals=3)
320300
>>> print(round(stat, 4), ci)
321-
0.7009 [0.403 1.009]
301+
0.7009 [0.413 1.01 ]
322302
323-
3. Bootstrapped confidence interval of a standard deviation (univariate)
303+
3. Bootstrapped BCa confidence interval of a standard deviation (univariate)
324304
325305
>>> import numpy as np
326306
>>> stat = np.std(x, ddof=1)
327307
>>> ci = pg.compute_bootci(x, func="std", seed=123)
328308
>>> print(round(stat, 4), ci)
329-
1.5534 [1.38 1.8 ]
309+
1.5534 [1.39 1.81]
330310
331-
Compare to SciPy's built-in bootstrap function, which returns the bias-corrected and
332-
accelerated CIs (see Notes).
333-
334-
>>> def std(x, axis):
335-
... return np.std(x, ddof=1, axis=axis)
336-
>>> bt_scipy = bootstrap(data=(x,), statistic=std, n_resamples=2000, random_state=123)
337-
>>> np.round(bt_scipy.confidence_interval, 2)
338-
array([1.39, 1.81])
339-
340-
Changing the confidence intervals type in Pingouin
311+
Changing the confidence intervals method
341312
342313
>>> pg.compute_bootci(x, func="std", seed=123, method="norm")
343314
array([1.37, 1.76])
344315
345316
>>> pg.compute_bootci(x, func="std", seed=123, method="percentile")
346-
array([1.35, 1.75])
317+
array([1.36, 1.75])
347318
348319
4. Bootstrapped confidence interval using a custom univariate function
349320
350321
>>> from scipy.stats import skew
351322
>>> round(skew(x), 4), pg.compute_bootci(x, func=skew, n_boot=10000, seed=123)
352-
(-0.137, array([-0.55, 0.32]))
323+
(-0.137, array([-0.51, 0.38]))
353324
354325
5. Bootstrapped confidence interval using a custom bivariate function. Here, x and y are not
355326
paired and can therefore have different sizes.
@@ -368,21 +339,31 @@ def compute_bootci(
368339
... f"The bootstrap distribution has {bt.size} samples. The mean and standard "
369340
... f"{bt.mean():.4f} ± {bt.std():.4f}"
370341
... )
371-
The bootstrap distribution has 10000 samples. The mean and standard 0.8807 ± 0.1704
342+
The bootstrap distribution has 10000 samples. The mean and standard 0.8792 ± 0.1707
372343
"""
373344
from inspect import isfunction, isroutine
374345

346+
from scipy.stats import bootstrap as scipy_bootstrap
375347
from scipy.stats import norm
376348

377349
# Check other arguments
378350
assert isinstance(confidence, float)
379351
assert 0 < confidence < 1, "confidence must be between 0 and 1."
380-
assert method in ["norm", "normal", "percentile", "per", "cpercentile", "cper"]
352+
valid_methods = [
353+
"norm",
354+
"normal",
355+
"percentile",
356+
"per",
357+
"bca",
358+
"BCa",
359+
"basic",
360+
]
361+
assert method in valid_methods, f"method must be one of {valid_methods}."
381362
assert isfunction(func) or isinstance(func, str) or isroutine(func), (
382363
"func must be a function (e.g. np.mean, custom function) or a string (e.g. 'pearson'). "
383364
"See documentation for more details."
384365
)
385-
vectorizable = False
366+
vectorized = False
386367

387368
# Check x
388369
x = np.asarray(x)
@@ -423,75 +404,63 @@ def func(x, y):
423404
return compute_effsize(x, y, paired=paired, eftype=func_str)
424405

425406
elif func == "mean":
426-
vectorizable = True
407+
vectorized = True
427408

428-
def func(x):
429-
return np.mean(x, axis=0)
409+
def func(x, axis=-1):
410+
return np.mean(x, axis=axis)
430411

431412
elif func == "std":
432-
vectorizable = True
413+
vectorized = True
433414

434-
def func(x):
435-
return np.std(x, ddof=1, axis=0)
415+
def func(x, axis=-1):
416+
return np.std(x, ddof=1, axis=axis)
436417

437418
elif func == "var":
438-
vectorizable = True
419+
vectorized = True
439420

440-
def func(x):
441-
return np.var(x, ddof=1, axis=0)
421+
def func(x, axis=-1):
422+
return np.var(x, ddof=1, axis=axis)
442423

443424
else:
444425
raise ValueError("Function string not recognized.")
445426

446-
# Bootstrap
447-
bootstat = np.empty(n_boot)
448-
rng = np.random.default_rng(seed) # Random seed
449-
boot_x = rng.choice(np.arange(nx), size=(nx, n_boot), replace=True)
450-
451-
if y is not None:
452-
reference = func(x, y)
453-
if paired:
454-
for i in range(n_boot):
455-
# Note that here we use a bootstrapping procedure with replacement
456-
# of all the pairs (Xi, Yi). This is NOT suited for
457-
# hypothesis testing such as p-value estimation). Instead, for the
458-
# latter, one must only shuffle the Y values while keeping the X
459-
# values constant, i.e.:
460-
# >>> boot_x = rng.random_sample((n_boot, n)).argsort(axis=1)
461-
# >>> for i in range(n_boot):
462-
# >>> bootstat[i] = func(x, y[boot_x[i, :]])
463-
bootstat[i] = func(x[boot_x[:, i]], y[boot_x[:, i]])
464-
else:
465-
boot_y = rng.choice(np.arange(ny), size=(ny, n_boot), replace=True)
466-
for i in range(n_boot):
467-
bootstat[i] = func(x[boot_x[:, i]], y[boot_y[:, i]])
427+
# Determine scipy bootstrap method
428+
use_custom_ci = method in ["norm", "normal"]
429+
if method in ["bca", "BCa"]:
430+
_scipy_method = "BCa"
431+
elif method in ["per", "percentile"]:
432+
_scipy_method = "percentile"
433+
elif method == "basic":
434+
_scipy_method = "basic"
468435
else:
469-
reference = func(x)
470-
if vectorizable:
471-
bootstat = func(x[boot_x])
472-
else:
473-
for i in range(n_boot):
474-
bootstat[i] = func(x[boot_x[:, i]])
436+
_scipy_method = "percentile"
437+
438+
# Run scipy bootstrap
439+
data = (x, y) if y is not None else (x,)
440+
_paired = paired if y is not None else False
441+
boot_result = scipy_bootstrap(
442+
data=data,
443+
statistic=func,
444+
n_resamples=n_boot,
445+
confidence_level=confidence,
446+
method=_scipy_method,
447+
paired=_paired,
448+
vectorized=vectorized,
449+
random_state=seed,
450+
)
451+
bootstat = boot_result.bootstrap_distribution
475452

476453
# CONFIDENCE INTERVALS
477-
# See Matlab bootci function
478-
alpha = (1 - confidence) / 2
479-
if method in ["norm", "normal"]:
454+
if use_custom_ci:
480455
# Normal approximation
481-
za = norm.ppf(alpha) # = 1.96
456+
alpha = (1 - confidence) / 2
457+
reference = func(x) if y is None else func(x, y)
458+
za = norm.ppf(alpha)
482459
se = np.std(bootstat, ddof=1)
483460
bias = np.mean(bootstat - reference)
484461
ci = np.array([reference - bias + se * za, reference - bias - se * za])
485-
elif method in ["percentile", "per"]:
486-
# Simple percentile
487-
interval = 100 * np.array([alpha, 1 - alpha])
488-
ci = np.percentile(bootstat, interval)
489-
pass
490462
else:
491-
# Bias-corrected percentile bootstrap
492-
from pingouin.regression import _bias_corrected_ci
493-
494-
ci = _bias_corrected_ci(bootstat, reference, alpha=(1 - confidence))
463+
ci = np.array([boot_result.confidence_interval.low, boot_result.confidence_interval.high])
495464

496465
ci = np.round(ci, decimals)
497466

0 commit comments

Comments
 (0)