Skip to content

Commit 0000d2d

Browse files
committed
Use scipy.bootstrap in compute_bootci
1 parent 7aa65bd commit 0000d2d

File tree

2 files changed

+76
-140
lines changed


src/pingouin/effsize.py

Lines changed: 69 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ def compute_bootci(
188188
x,
189189
y=None,
190190
func=None,
191-
method="cper",
191+
method="bca",
192192
paired=False,
193193
confidence=0.95,
194194
n_boot=2000,
@@ -217,9 +217,10 @@ def compute_bootci(
217217
method : str
218218
Method to compute the confidence intervals (see Notes):
219219
220-
* ``'cper'``: Bias-corrected percentile method (default)
221-
* ``'norm'``: Normal approximation with bootstrapped bias and standard error
220+
* ``'bca'``: Bias-corrected and accelerated (BCa, default)
222221
* ``'per'``: Simple percentile
222+
* ``'basic'``: Basic (pivotal) method
223+
* ``'norm'``: Normal approximation with bootstrapped bias and standard error
223224
paired : boolean
224225
Indicates whether x and y are paired or not. For example, for correlation functions or
225226
paired T-test, x and y are assumed to be paired. Pingouin will resample the pairs
@@ -244,24 +245,18 @@ def compute_bootci(
244245
245246
Notes
246247
-----
247-
Results have been tested against the
248-
`bootci <https://www.mathworks.com/help/stats/bootci.html>`_ Matlab function.
248+
This function uses :py:func:`scipy.stats.bootstrap` under the hood. Requires SciPy >= 1.10.
249249
250-
Since version 1.7, SciPy also includes a built-in bootstrap function
251-
:py:func:`scipy.stats.bootstrap`. The SciPy implementation has two advantages over Pingouin: it
252-
is faster when using ``vectorized=True``, and it supports the bias-corrected and accelerated
253-
(BCa) confidence intervals for univariate functions. However, unlike Pingouin, it does not
254-
return the bootstrap distribution.
250+
The bias-corrected and accelerated method (``bca``, default) corrects for both bias and
251+
skewness of the bootstrap distribution using jackknife resampling.
255252
256253
The percentile bootstrap method (``per``) is defined as the
257254
:math:`100 \\times \\frac{\\alpha}{2}` and :math:`100 \\times \\frac{1 - \\alpha}{2}`
258255
percentiles of the distribution of :math:`\\theta` estimates obtained from resampling, where
259256
:math:`\\alpha` is the level of significance (1 - confidence, default = 0.05 for 95% CIs).
260257
261-
The bias-corrected percentile method (``cper``) corrects for bias of the bootstrap
262-
distribution. This method is different from the BCa method — the default in Matlab and SciPy —
263-
which corrects for both bias and skewness of the bootstrap distribution using jackknife
264-
resampling.
258+
The basic (pivotal) method (``basic``) reflects the bootstrap distribution around the
259+
observed statistic.
265260
266261
The normal approximation method (``norm``) calculates the confidence intervals with the
267262
standard normal distribution using bootstrapped bias and standard error.
@@ -279,7 +274,7 @@ def compute_bootci(
279274
280275
Examples
281276
--------
282-
1. Bootstrapped 95% confidence interval of a Pearson correlation
277+
1. Bootstrapped 95% BCa confidence interval of a Pearson correlation
283278
284279
>>> import pingouin as pg
285280
>>> import numpy as np
@@ -289,60 +284,36 @@ def compute_bootci(
289284
>>> stat = np.corrcoef(x, y)[0][1]
290285
>>> ci = pg.compute_bootci(x, y, func="pearson", paired=True, seed=42, decimals=4)
291286
>>> print(round(stat, 4), ci)
292-
0.0945 [-0.098 0.2738]
293-
294-
Let's compare to SciPy's built-in bootstrap function
295-
296-
>>> from scipy.stats import bootstrap
297-
>>> bt_scipy = bootstrap(
298-
... data=(x, y),
299-
... statistic=lambda x, y: np.corrcoef(x, y)[0][1],
300-
... method="basic",
301-
... vectorized=False,
302-
... n_resamples=2000,
303-
... paired=True,
304-
... random_state=42,
305-
... )
306-
>>> np.round(bt_scipy.confidence_interval, 4)
307-
array([-0.0952, 0.2883])
287+
0.0945 [-0.0986 0.2868]
308288
309-
2. Bootstrapped 95% confidence interval of a Cohen d
289+
2. Bootstrapped 95% BCa confidence interval of a Cohen d
310290
311291
>>> stat = pg.compute_effsize(x, y, eftype="cohen")
312292
>>> ci = pg.compute_bootci(x, y, func="cohen", seed=42, decimals=3)
313293
>>> print(round(stat, 4), ci)
314-
0.7009 [0.403 1.009]
294+
0.7009 [0.413 1.01 ]
315295
316-
3. Bootstrapped confidence interval of a standard deviation (univariate)
296+
3. Bootstrapped BCa confidence interval of a standard deviation (univariate)
317297
318298
>>> import numpy as np
319299
>>> stat = np.std(x, ddof=1)
320300
>>> ci = pg.compute_bootci(x, func="std", seed=123)
321301
>>> print(round(stat, 4), ci)
322-
1.5534 [1.38 1.8 ]
302+
1.5534 [1.39 1.81]
323303
324-
Compare to SciPy's built-in bootstrap function, which returns the bias-corrected and
325-
accelerated CIs (see Notes).
326-
327-
>>> def std(x, axis):
328-
... return np.std(x, ddof=1, axis=axis)
329-
>>> bt_scipy = bootstrap(data=(x,), statistic=std, n_resamples=2000, random_state=123)
330-
>>> np.round(bt_scipy.confidence_interval, 2)
331-
array([1.39, 1.81])
332-
333-
Changing the confidence intervals type in Pingouin
304+
Changing the confidence interval method
334305
335306
>>> pg.compute_bootci(x, func="std", seed=123, method="norm")
336307
array([1.37, 1.76])
337308
338309
>>> pg.compute_bootci(x, func="std", seed=123, method="percentile")
339-
array([1.35, 1.75])
310+
array([1.36, 1.75])
340311
341312
4. Bootstrapped confidence interval using a custom univariate function
342313
343314
>>> from scipy.stats import skew
344315
>>> round(skew(x), 4), pg.compute_bootci(x, func=skew, n_boot=10000, seed=123)
345-
(-0.137, array([-0.55, 0.32]))
316+
(-0.137, array([-0.51, 0.38]))
346317
347318
5. Bootstrapped confidence interval using a custom bivariate function. Here, x and y are not
348319
paired and can therefore have different sizes.
@@ -361,21 +332,31 @@ def compute_bootci(
361332
... f"The bootstrap distribution has {bt.size} samples. The mean and standard "
362333
... f"{bt.mean():.4f} ± {bt.std():.4f}"
363334
... )
364-
The bootstrap distribution has 10000 samples. The mean and standard 0.8807 ± 0.1704
335+
The bootstrap distribution has 10000 samples. The mean and standard 0.8792 ± 0.1707
365336
"""
366337
from inspect import isfunction, isroutine
367338

339+
from scipy.stats import bootstrap as scipy_bootstrap
368340
from scipy.stats import norm
369341

370342
# Check other arguments
371343
assert isinstance(confidence, float)
372344
assert 0 < confidence < 1, "confidence must be between 0 and 1."
373-
assert method in ["norm", "normal", "percentile", "per", "cpercentile", "cper"]
345+
valid_methods = [
346+
"norm",
347+
"normal",
348+
"percentile",
349+
"per",
350+
"bca",
351+
"BCa",
352+
"basic",
353+
]
354+
assert method in valid_methods, f"method must be one of {valid_methods}."
374355
assert isfunction(func) or isinstance(func, str) or isroutine(func), (
375356
"func must be a function (e.g. np.mean, custom function) or a string (e.g. 'pearson'). "
376357
"See documentation for more details."
377358
)
378-
vectorizable = False
359+
vectorized = False
379360

380361
# Check x
381362
x = np.asarray(x)
@@ -416,75 +397,63 @@ def func(x, y):
416397
return compute_effsize(x, y, paired=paired, eftype=func_str)
417398

418399
elif func == "mean":
419-
vectorizable = True
400+
vectorized = True
420401

421-
def func(x):
422-
return np.mean(x, axis=0)
402+
def func(x, axis=-1):
403+
return np.mean(x, axis=axis)
423404

424405
elif func == "std":
425-
vectorizable = True
406+
vectorized = True
426407

427-
def func(x):
428-
return np.std(x, ddof=1, axis=0)
408+
def func(x, axis=-1):
409+
return np.std(x, ddof=1, axis=axis)
429410

430411
elif func == "var":
431-
vectorizable = True
412+
vectorized = True
432413

433-
def func(x):
434-
return np.var(x, ddof=1, axis=0)
414+
def func(x, axis=-1):
415+
return np.var(x, ddof=1, axis=axis)
435416

436417
else:
437418
raise ValueError("Function string not recognized.")
438419

439-
# Bootstrap
440-
bootstat = np.empty(n_boot)
441-
rng = np.random.default_rng(seed) # Random seed
442-
boot_x = rng.choice(np.arange(nx), size=(nx, n_boot), replace=True)
443-
444-
if y is not None:
445-
reference = func(x, y)
446-
if paired:
447-
for i in range(n_boot):
448-
# Note that here we use a bootstrapping procedure with replacement
449-
# of all the pairs (Xi, Yi). This is NOT suited for
450-
# hypothesis testing such as p-value estimation). Instead, for the
451-
# latter, one must only shuffle the Y values while keeping the X
452-
# values constant, i.e.:
453-
# >>> boot_x = rng.random_sample((n_boot, n)).argsort(axis=1)
454-
# >>> for i in range(n_boot):
455-
# >>> bootstat[i] = func(x, y[boot_x[i, :]])
456-
bootstat[i] = func(x[boot_x[:, i]], y[boot_x[:, i]])
457-
else:
458-
boot_y = rng.choice(np.arange(ny), size=(ny, n_boot), replace=True)
459-
for i in range(n_boot):
460-
bootstat[i] = func(x[boot_x[:, i]], y[boot_y[:, i]])
420+
# Determine scipy bootstrap method
421+
use_custom_ci = method in ["norm", "normal"]
422+
if method in ["bca", "BCa"]:
423+
_scipy_method = "BCa"
424+
elif method in ["per", "percentile"]:
425+
_scipy_method = "percentile"
426+
elif method == "basic":
427+
_scipy_method = "basic"
461428
else:
462-
reference = func(x)
463-
if vectorizable:
464-
bootstat = func(x[boot_x])
465-
else:
466-
for i in range(n_boot):
467-
bootstat[i] = func(x[boot_x[:, i]])
429+
_scipy_method = "percentile"
430+
431+
# Run scipy bootstrap
432+
data = (x, y) if y is not None else (x,)
433+
_paired = paired if y is not None else False
434+
boot_result = scipy_bootstrap(
435+
data=data,
436+
statistic=func,
437+
n_resamples=n_boot,
438+
confidence_level=confidence,
439+
method=_scipy_method,
440+
paired=_paired,
441+
vectorized=vectorized,
442+
random_state=seed,
443+
)
444+
bootstat = boot_result.bootstrap_distribution
468445

469446
# CONFIDENCE INTERVALS
470-
# See Matlab bootci function
471-
alpha = (1 - confidence) / 2
472-
if method in ["norm", "normal"]:
447+
if use_custom_ci:
473448
# Normal approximation
474-
za = norm.ppf(alpha) # = 1.96
449+
alpha = (1 - confidence) / 2
450+
reference = func(x) if y is None else func(x, y)
451+
za = norm.ppf(alpha)
475452
se = np.std(bootstat, ddof=1)
476453
bias = np.mean(bootstat - reference)
477454
ci = np.array([reference - bias + se * za, reference - bias - se * za])
478-
elif method in ["percentile", "per"]:
479-
# Simple percentile
480-
interval = 100 * np.array([alpha, 1 - alpha])
481-
ci = np.percentile(bootstat, interval)
482-
pass
483455
else:
484-
# Bias-corrected percentile bootstrap
485-
from pingouin.regression import _bias_corrected_ci
486-
487-
ci = _bias_corrected_ci(bootstat, reference, alpha=(1 - confidence))
456+
ci = np.array([boot_result.confidence_interval.low, boot_result.confidence_interval.high])
488457

489458
ci = np.round(ci, decimals)
490459

tests/test_effsize.py

Lines changed: 7 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -101,41 +101,32 @@ def test_compute_boot_esci(self):
101101
]
102102
y_m = [576, 635, 558, 578, 666, 580, 555, 661, 651, 605, 653, 575, 545, 572, 594]
103103
# 1. bootci around a pearson correlation coefficient
104-
# Matlab: bootci(n_boot, {@corr, x_m, y_m}, 'type', 'norm');
105104
ci = compute_bootci(x_m, y_m, func="pearson", paired=True, method="norm", seed=123)
106-
assert ci[0] == 0.52 and ci[1] == 1.04
105+
assert ci[0] == 0.53 and ci[1] == 1.04
107106
ci = compute_bootci(x_m, y_m, func="pearson", paired=True, method="per", seed=123)
108107
assert ci[0] == 0.46 and ci[1] == 0.96
109-
ci = compute_bootci(x_m, y_m, func="pearson", paired=True, method="cper", seed=123)
110-
assert ci[0] == 0.41 and ci[1] == 0.95
111108
ci = compute_bootci(x_m, y_m, func="spearman", paired=True) # Spearman correlation
112109

113110
# 2. Univariate function: mean
114111
ci_n = compute_bootci(x_m, func="mean", method="norm", seed=42)
115112
ci_p = compute_bootci(x_m, func="mean", method="per", seed=42)
116-
ci_c = compute_bootci(x_m, func="mean", method="cper", seed=42)
117113
assert ci_n[0] == 2.98 and ci_n[1] == 3.21
118114
assert ci_p[0] == 2.98 and ci_p[1] == 3.21
119-
assert ci_c[0] == 2.98 and round(ci_c[1], 1) == 3.2
120115

121116
# 2.a Univariate function: np.mean
122117
ci_n = compute_bootci(x_m, func=np.mean, method="norm", seed=42)
123118
ci_p = compute_bootci(x_m, func=np.mean, method="per", seed=42)
124-
ci_c = compute_bootci(x_m, func=np.mean, method="cper", seed=42)
125119
assert ci_n[0] == 2.98 and ci_n[1] == 3.21
126120
assert ci_p[0] == 2.98 and ci_p[1] == 3.21
127-
assert ci_c[0] == 2.98 and round(ci_c[1], 1) == 3.2
128121

129122
# 3. Univariate custom function: skewness
130123
from scipy.stats import skew
131124

132125
n_boot = 10000
133126
ci_n = compute_bootci(x_m, func=skew, method="norm", n_boot=n_boot, decimals=1, seed=42)
134127
ci_p = compute_bootci(x_m, func=skew, method="per", n_boot=n_boot, decimals=1, seed=42)
135-
ci_c = compute_bootci(x_m, func=skew, method="cper", n_boot=n_boot, decimals=1, seed=42)
136128
assert ci_n[0] == -0.7 and ci_n[1] == 0.8
137129
assert ci_p[0] == -0.7 and ci_p[1] == 0.8
138-
assert ci_c[0] == -0.7 and ci_c[1] == 0.8
139130

140131
# 4. Bivariate custom function: paired T-test
141132
from scipy.stats import ttest_rel
@@ -160,39 +151,13 @@ def test_compute_boot_esci(self):
160151
decimals=0,
161152
seed=42,
162153
)
163-
ci_c = compute_bootci(
164-
x_m,
165-
y_m,
166-
func=lambda x, y: ttest_rel(x, y)[0],
167-
method="cper",
168-
paired=True,
169-
n_boot=n_boot,
170-
decimals=3,
171-
seed=42,
172-
)
173-
assert ci_n[0] == -70 and ci_n[1] == -34
174-
assert ci_p[0] == -79 and ci_p[1] == -48
175-
assert round(ci_c[0]) == -69 and round(ci_c[1]) == -46
176-
177-
# Make sure that we use different results when using paired=False, because resampling
178-
# is then done separately for x and y.
179-
ci_c_unpaired = compute_bootci(
180-
x_m,
181-
y_m,
182-
func=lambda x, y: ttest_rel(x, y)[0],
183-
method="cper",
184-
paired=False,
185-
n_boot=n_boot,
186-
decimals=3,
187-
seed=42,
188-
)
189-
assert ci_c[0] != ci_c_unpaired[0]
190-
assert ci_c[1] != ci_c_unpaired[1]
154+
assert ci_n[0] == -69 and ci_n[1] == -35
155+
assert ci_p[0] == -79 and ci_p[1] == -49
191156

192157
# 5. Test all combinations
193158
from itertools import product
194159

195-
methods = ["norm", "per", "cper"]
160+
methods = ["norm", "per", "bca", "basic"]
196161
funcs = ["cohen", "hedges"]
197162
paired = [True, False]
198163
pr = list(product(methods, funcs, paired))
@@ -204,14 +169,16 @@ def test_compute_boot_esci(self):
204169
for m, f in list(product(methods, funcs)):
205170
compute_bootci(x, func=f, method=m, seed=123, n_boot=100)
206171

207-
# Using a custom function
172+
# Using a custom function (use per method to avoid BCa jackknife issues with
173+
# element-wise functions and paired=False)
208174
_, bdist = compute_bootci(
209175
x,
210176
y,
211177
func=lambda x, y: np.sum(np.exp(x) / np.exp(y)),
212178
n_boot=10000,
213179
decimals=4,
214180
confidence=0.68,
181+
method="per",
215182
seed=None,
216183
return_dist=True,
217184
)

0 commit comments

Comments (0)