Skip to content

Commit 140bda9

Browse files
authored
Merge pull request #141 from PLN-team/sandwich-var
Fixing Sandwich variance - Issue #140
2 parents 08577cd + 9c4b19c commit 140bda9

File tree

15 files changed

+186
-162
lines changed

15 files changed

+186
-162
lines changed

.github/workflows/R-CMD-check.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ jobs:
1919
matrix:
2020
config:
2121
- {os: macOS-latest, r: 'release'}
22-
- {os: macOS-latest, r: 'oldrel-1'}
2322
- {os: windows-latest, r: 'release'}
2423
- {os: windows-latest, r: 'oldrel-1'}
2524
- {os: ubuntu-latest, r: 'release'}

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: PLNmodels
22
Title: Poisson Lognormal Models
3-
Version: 1.2.1
3+
Version: 1.2.2
44
Authors@R: c(
55
person("Julien", "Chiquet", , "julien.chiquet@inrae.fr", role = c("aut", "cre"),
66
comment = c(ORCID = "0000-0002-3629-3429")),

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Current version
22

3+
* fix sandwich variance estimation (PR #140)
34
* fix use of native pipe to ensure compatibility with R 3.6 (merge PR #125, fix #124)
45

56
# PLNmodels 1.2.0 (2024-03-05)

R/PLNfit-S3methods.R

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ sigma.PLNfit <- function(object, ...) {
156156
#' @description Extracts univariate standard errors for the estimated coefficient of B. Standard errors are computed from the (approximate) Fisher information matrix.
157157
#'
158158
#' @param object an R6 object with class PLNfit
159-
#' @param type string describing the type of variance approximation: "variational", "jackknife", "sandwich" (only for fixed covariance). Default is "variational".
159+
#' @param type string describing the type of variance approximation: "variational", "jackknife", "sandwich". Default is "sandwich".
160160
#' @param parameter string describing the target parameter: either B (regression coefficients) or Omega (inverse residual covariance)
161161
#'
162162
#' @seealso [vcov.PLNfit()] for the complete variance covariance estimation of the coefficient
@@ -166,16 +166,16 @@ sigma.PLNfit <- function(object, ...) {
166166
#' data(trichoptera)
167167
#' trichoptera <- prepare_data(trichoptera$Abundance, trichoptera$Covariate)
168168
#' myPLN <- PLN(Abundance ~ 1 + offset(log(Offset)), data = trichoptera,
169-
#' control = PLN_param(config_post = list(variational_var = TRUE)))
169+
#' control = PLN_param(config_post = list(sandwich_var = TRUE)))
170170
#' standard_error(myPLN)
171171
#' @export
172-
standard_error <- function(object, type = c("variational", "jackknife", "sandwich"), parameter = c("B", "Omega")) {
172+
standard_error <- function(object, type = c("sandwich", "variational", "jackknife"), parameter = c("B", "Omega")) {
173173
UseMethod("standard_error", object)
174174
}
175175

176176
#' @describeIn standard_error Component-wise standard errors of B in [`PLNfit`]
177177
#' @export
178-
standard_error.PLNfit <- function(object, type = c("variational", "jackknife", "bootstrap", "sandwich"), parameter = c("B", "Omega")) {
178+
standard_error.PLNfit <- function(object, type = c("sandwich", "variational", "jackknife", "bootstrap"), parameter = c("B", "Omega")) {
179179
type <- match.arg(type)
180180
par <- match.arg(parameter)
181181
if (type == "variational" & is.null(attr(object$model_par$B, "variance_variational")))
@@ -184,14 +184,14 @@ standard_error.PLNfit <- function(object, type = c("variational", "jackknife", "
184184
stop("Jackknife estimation not available: rerun by setting `jackknife = TRUE` in the control list.")
185185
if (type == "bootstrap" & is.null(attr(object$model_par$B, "variance_bootstrap")))
186186
stop("Bootstrap estimation not available: rerun by setting `bootstrap > 0` in the control list.")
187-
if (type == "sandwich")
188-
stop("Sandwich estimator is only available for fixed covariance / precision matrix.")
187+
if (type == "sandwich" & is.null(attr(object$model_par$B, "variance_sandwich")))
188+
stop("Sandwich estimator not available: rerun by setting `sandwich_var = TRUE` in the control list.")
189189
attr(object$model_par[[par]], paste0("variance_", type)) %>% sqrt()
190190
}
191191

192192
#' @describeIn standard_error Component-wise standard errors of B in [`PLNfit_fixedcov`]
193193
#' @export
194-
standard_error.PLNfit_fixedcov <- function(object, type = c("variational", "jackknife", "bootstrap", "sandwich"), parameter = c("B", "Omega")) {
194+
standard_error.PLNfit_fixedcov <- function(object, type = c("sandwich", "variational", "jackknife", "bootstrap"), parameter = c("B", "Omega")) {
195195
type <- match.arg(type)
196196
par <- match.arg(parameter)
197197
if (par == "Omega")
@@ -202,5 +202,7 @@ standard_error.PLNfit_fixedcov <- function(object, type = c("variational", "jack
202202
stop("Jackknife estimation not available: rerun by setting `jackknife = TRUE` in the control list.")
203203
if (type == "bootstrap" & is.null(attr(object$model_par$B, "variance_bootstrap")))
204204
stop("Bootstrap estimation not available: rerun by setting `bootstrap > 0` in the control list.")
205+
if (type == "sandwich" & is.null(attr(object$model_par$B, "variance_sandwich")))
206+
stop("Sandwich estimator not available: rerun by setting `sandwich_var = TRUE` in the control list.")
205207
attr(object$model_par[[par]], paste0("variance_", type)) %>% sqrt()
206208
}

R/PLNfit-class.R

Lines changed: 21 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ PLNfit <- R6Class(
191191
variance_variational = function(X, config = config_default_nlopt) {
192192
## Variance of B for n data points
193193
fisher <- Matrix::bdiag(lapply(1:self$p, function(j) {
194-
crossprod(X, private$A[, j] * X) # t(X) %*% diag(A[, i]) %*% X
194+
crossprod(X, private$A[, j] * X) # t(X) %*% diag(A[, j]) %*% X
195195
}))
196196
vcov_B <- tryCatch(Matrix::solve(fisher), error = function(e) {e})
197197
if (is(vcov_B, "error")) {
@@ -220,22 +220,11 @@ PLNfit <- R6Class(
220220

221221
compute_vcov_from_resamples = function(resamples){
222222
B_list = resamples %>% map("B")
223-
#print (B_list)
224223
vcov_B = lapply(seq(1, ncol(private$B)), function(B_col){
225224
param_ests_for_col = B_list %>% map(~.x[, B_col])
226225
param_ests_for_col = do.call(rbind, param_ests_for_col)
227-
#print (param_ests_for_col)
228226
row_vcov = cov(param_ests_for_col)
229227
})
230-
#print ("vcov blocks")
231-
#print (vcov_B)
232-
233-
#B_vcov <- resamples %>% map("B") %>% map(~( . )) %>% reduce(cov)
234-
235-
#var_jack <- jacks %>% map("B") %>% map(~( (. - B_jack)^2)) %>% reduce(`+`) %>%
236-
# `dimnames<-`(dimnames(private$B))
237-
#B_hat <- private$B[,] ## strips attributes while preserving names
238-
239228
vcov_B = Matrix::bdiag(vcov_B) %>% as.matrix()
240229

241230
rownames(vcov_B) <- colnames(vcov_B) <-
@@ -244,33 +233,32 @@ PLNfit <- R6Class(
244233
## Hack to make sure that species is first and varies slowest
245234
apply(1, paste0, collapse = "_")
246235

247-
#print (pheatmap::pheatmap(vcov_B, cluster_rows=FALSE, cluster_cols=FALSE))
248-
249-
250-
#names = lapply(bootstrapped_df$cov_mat, function(m){ colnames(m)}) %>% unlist()
251-
#rownames(bootstrapped_vhat) = names
252-
#colnames(bootstrapped_vhat) = names
253-
254236
vcov_B = methods::as(vcov_B, "dgCMatrix")
255237

256238
return(vcov_B)
257239
},
258240

241+
vcov_sandwich_B = function(Y, X) {
242+
vcov_sand <- get_sandwich_variance_B(Y, X, private$A,
243+
private$S, private$Sigma, diag(private$Omega)
244+
)
245+
attr(private$B, "vcov_sandwich") <- vcov_sand
246+
attr(private$B, "variance_sandwich") <- matrix(diag(vcov_sand), nrow = self$d, ncol = self$p,
247+
dimnames = dimnames(private$B))
248+
},
249+
259250
variance_jackknife = function(Y, X, O, w, config = config_default_nlopt) {
260251
jacks <- future.apply::future_lapply(seq_len(self$n), function(i) {
261252
data <- list(Y = Y[-i, , drop = FALSE],
262253
X = X[-i, , drop = FALSE],
263254
O = O[-i, , drop = FALSE],
264255
w = w[-i])
265256
args <- list(data = data,
266-
# params = list(B = private$B,
267-
# M = matrix(0, self$n-1, self$p),
268-
# S = private$S[-i, , drop = FALSE]),
269257
params = do.call(compute_PLN_starting_point, data),
270258
config = config)
271259
optim_out <- do.call(private$optimizer$main, args)
272260
optim_out[c("B", "Omega")]
273-
})
261+
}, future.seed = TRUE, future.scheduling = structure(TRUE, ordering = "random"))
274262

275263
B_jack <- jacks %>% map("B") %>% reduce(`+`) / self$n
276264
var_jack <- jacks %>% map("B") %>% map(~( (. - B_jack)^2)) %>% reduce(`+`) %>%
@@ -298,21 +286,17 @@ PLNfit <- R6Class(
298286
O = O[resample, , drop = FALSE],
299287
w = w[resample])
300288
if (config$backend == "torch") # Convert data to torch tensors
301-
data <- lapply(data, torch_tensor, device = config$device) # list with Y, X, O, w
302-
303-
#print (data$Y$device)
289+
data <- lapply(data, torch_tensor, device = config$device)
304290

305291
args <- list(data = data,
306-
# params = list(B = private$B, M = matrix(0,self$n,self$p), S = private$S[resample, ]),
307292
params = do.call(compute_PLN_starting_point, data),
308293
config = config)
309294
if (config$backend == "torch") # Convert data to torch tensors
310295
args$params <- lapply(args$params, torch_tensor, requires_grad = TRUE, device = config$device) # list with B, M, S
311296

312297
optim_out <- do.call(private$optimizer$main, args)
313-
#print (optim_out)
314298
optim_out[c("B", "Omega", "monitoring")]
315-
})
299+
}, future.seed = TRUE, future.scheduling = structure(TRUE, ordering = "random"))
316300

317301
B_boots <- boots %>% map("B") %>% reduce(`+`) / n_resamples
318302
attr(private$B, "variance_bootstrap") <-
@@ -455,7 +439,7 @@ PLNfit <- R6Class(
455439
#' * jackknife boolean indicating whether jackknife should be performed to evaluate bias and variance of the model parameters. Default is FALSE.
456440
#' * bootstrap integer indicating the number of bootstrap resamples generated to evaluate the variance of the model parameters. Default is 0 (inactivated).
457441
#' * variational_var boolean indicating whether variational Fisher information matrix should be computed to estimate the variance of the model parameters (highly underestimated). Default is FALSE.
458-
#' * rsquared boolean indicating whether approximation of R2 based on deviance should be computed. Default is TRUE
442+
#' * sandwich_var boolean indicating whether sandwich estimator should be computed to estimate the variance of the model parameters (highly underestimated). Default is FALSE.
459443
#' * trace integer for verbosity. should be > 1 to see output in post-treatments
460444
postTreatment = function(responses, covariates, offsets, weights = rep(1, nrow(responses)), config_post, config_optim, nullModel = NULL) {
461445
## PARAMATERS DIMNAMES
@@ -496,6 +480,11 @@ PLNfit <- R6Class(
496480
}
497481
private$variance_bootstrap(responses, covariates, offsets, weights, n_resamples=config_post$bootstrap, config = config_optim)
498482
}
483+
## 5. compute and store matrix of standard variances for B with sandwich correction approximation
484+
if (config_post$sandwich_var) {
485+
if(config_post$trace > 1) cat("\n\tComputing sandwich estimator of the variance...")
486+
private$vcov_sandwich_B(responses, covariates)
487+
}
499488
},
500489

501490
#' @description Predict position, scores or observations of new data.
@@ -920,25 +909,8 @@ PLNfit_fixedcov <- R6Class(
920909
optim_out <- do.call(private$optimizer$main, args)
921910
do.call(self$update, optim_out)
922911
private$Sigma <- solve(optim_out$Omega)
923-
},
924-
925-
#' @description Update R2, fisher and std_err fields after optimization
926-
#' @param config_post a list for controlling the post-treatments (optional bootstrap, jackknife, R2, etc.). See details
927-
#' @param config_optim a list for controlling the optimization parameter. See details
928-
#' @details The list of parameters `config` controls the post-treatment processing, with the following entries:
929-
#' * trace integer for verbosity. should be > 1 to see output in post-treatments
930-
#' * jackknife boolean indicating whether jackknife should be performed to evaluate bias and variance of the model parameters. Default is FALSE.
931-
#' * bootstrap integer indicating the number of bootstrap resamples generated to evaluate the variance of the model parameters. Default is 0 (inactivated).
932-
#' * variational_var boolean indicating whether variational Fisher information matrix should be computed to estimate the variance of the model parameters (highly underestimated). Default is FALSE.
933-
#' * rsquared boolean indicating whether approximation of R2 based on deviance should be computed. Default is TRUE
934-
postTreatment = function(responses, covariates, offsets, weights = rep(1, nrow(responses)), config_post, config_optim, nullModel = NULL) {
935-
super$postTreatment(responses, covariates, offsets, weights, config_post, config_optim, nullModel)
936-
## 6. compute and store matrix of standard variances for B with sandwich correction approximation
937-
if (config_post$sandwich_var) {
938-
if(config_post$trace > 1) cat("\n\tComputing sandwich estimator of the variance...")
939-
private$vcov_sandwich_B(responses, covariates)
940-
}
941912
}
913+
942914
),
943915
private = list(
944916
## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -976,39 +948,14 @@ PLNfit_fixedcov <- R6Class(
976948
config = config)
977949
optim_out <- do.call(private$optimizer$main, args)
978950
optim_out[c("B", "Omega")]
979-
}, future.seed = TRUE)
951+
}, future.seed = TRUE, future.scheduling = structure(TRUE, ordering = "random"))
980952

981953
B_jack <- jacks %>% map("B") %>% reduce(`+`) / self$n
982954
var_jack <- jacks %>% map("B") %>% map(~( (. - B_jack)^2)) %>% reduce(`+`) %>%
983955
`dimnames<-`(dimnames(private$B))
984956
B_hat <- private$B[,] ## strips attributes while preserving names
985957
attr(private$B, "bias") <- (self$n - 1) * (B_jack - B_hat)
986958
attr(private$B, "variance_jackknife") <- (self$n - 1) / self$n * var_jack
987-
},
988-
989-
vcov_sandwich_B = function(Y, X) {
990-
getMat_iCnB <- function(i) {
991-
a_i <- as.numeric(private$A[i, ])
992-
s2_i <- as.numeric(private$S[i, ]**2)
993-
# omega <- as.numeric(1/diag(private$Sigma))
994-
# diag_mat_i <- diag(1/a_i + s2_i^2 / (1 + s2_i * (a_i + omega)))
995-
diag_mat_i <- diag(1/a_i + .5 * s2_i^2)
996-
solve(private$Sigma + diag_mat_i)
997-
}
998-
YmA <- Y - private$A
999-
Dn <- matrix(0, self$d*self$p, self$d*self$p)
1000-
Cn <- matrix(0, self$d*self$p, self$d*self$p)
1001-
for (i in 1:self$n) {
1002-
xxt_i <- tcrossprod(X[i, ])
1003-
Cn <- Cn - kronecker(getMat_iCnB(i) , xxt_i) / (self$n)
1004-
Dn <- Dn + kronecker(tcrossprod(YmA[i,]), xxt_i) / (self$n)
1005-
}
1006-
Cn_inv <- solve(Cn)
1007-
dim_names <- dimnames(attr(private$B, "vcov_variational"))
1008-
vcov_sand <- ((Cn_inv %*% Dn %*% Cn_inv) / self$n) %>% `dimnames<-`(dim_names)
1009-
attr(private$B, "vcov_sandwich") <- vcov_sand
1010-
attr(private$B, "variance_sandwich") <- matrix(diag(vcov_sand), nrow = self$d, ncol = self$p,
1011-
dimnames = dimnames(private$B))
1012959
}
1013960
),
1014961
active = list(

R/RcppExports.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,7 @@ cpp_test_packing <- function() {
8989
.Call('_PLNmodels_cpp_test_packing', PACKAGE = 'PLNmodels')
9090
}
9191

92+
get_sandwich_variance_B <- function(Y, X, A, S, Sigma, Diag_Omega) {
93+
.Call('_PLNmodels_get_sandwich_variance_B', PACKAGE = 'PLNmodels', Y, X, A, S, Sigma, Diag_Omega)
94+
}
95+

R/utils.R

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,23 +55,26 @@ config_post_default_PLNLDA <-
5555
jackknife = FALSE,
5656
bootstrap = 0L,
5757
rsquared = TRUE,
58-
variational_var = FALSE
58+
variational_var = FALSE,
59+
sandwich_var = FALSE
5960
)
6061

6162
config_post_default_PLNPCA <-
6263
list(
6364
jackknife = FALSE,
6465
bootstrap = 0L,
6566
rsquared = TRUE,
66-
variational_var = FALSE
67+
variational_var = FALSE,
68+
sandwich_var = FALSE
6769
)
6870

6971
config_post_default_PLNmixture <-
7072
list(
7173
jackknife = FALSE,
7274
bootstrap = 0L,
7375
rsquared = TRUE,
74-
variational_var = FALSE
76+
variational_var = FALSE,
77+
sandwich_var = FALSE
7578
)
7679

7780
status_to_message <- function(status) {

inst/check/variance_estimates.R

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,68 @@
11
library(tidyverse)
22
library(PLNmodels)
3+
library(future.apply)
34
set.seed(1234)
45

5-
nb_cores <- 10
6-
options(future.fork.enable = TRUE)
6+
plan(multisession, workers = 10)
77

8-
params <- PLNmodels:::create_parameters(n = 50, p = 10, d = 1, depths = 1e3)
8+
rmse <- function(theta_hat, theta_star) {
9+
sqrt(mean((theta_hat - theta_star)^2))
10+
}
11+
12+
params <- PLNmodels:::create_parameters(n = 250, p = 10, d = 1, depths = 1e3)
913
X <- params$X
1014
B <- params$B
11-
Y <- rPLN(n = nrow(X), mu = X %*% B, Sigma = params$Sigma, depths = params$depths)
15+
conf <- list(variational_var = TRUE, jackknife = FALSE, bootstrap = 50, sandwich_var = TRUE)
16+
seq_n <- c(50, 100, 250)
17+
18+
one_simu <- function(s) {
19+
cat("\nsimu", s)
20+
do.call(rbind, lapply(seq_n, function(n) {
21+
X_sub <- X[1:n, , drop = FALSE]
22+
Y <- rPLN(n = n, mu = X_sub %*% B, Sigma = params$Sigma, depths = params$depths)
23+
data <- prepare_data(Y, X_sub, offset = "none")
24+
logO <- attr(Y, "offsets")
25+
model <- PLN(Abundance ~ 0 + . + offset(logO), data = data, control = PLN_param(trace = FALSE, config_post = conf))
26+
27+
B_hat <- coef(model)
28+
vcov_sandwich <- attr(coef(model), "vcov_sandwich")
29+
vcov_bootstrap <- attr(coef(model), "vcov_bootstrap")
30+
vcov_variational <- attr(coef(model), "vcov_variational")
31+
32+
data.frame(rmse = rmse(B_hat, B),
33+
cover_sandwich = mean(abs(as.numeric(B_hat - B) %*% solve(chol(vcov_sandwich))) < 1.96),
34+
cover_bootstrap = mean(abs(as.numeric(B_hat - B) %*% solve(chol(vcov_bootstrap))) < 1.96),
35+
cover_variational = mean(abs(as.numeric(B_hat - B) %*% solve(chol(vcov_variational))) < 1.96),
36+
sample_size = n,
37+
simu = s)
38+
}))
39+
}
1240

41+
res <- do.call(rbind, lapply(1:50, one_simu))
42+
43+
res %>%
44+
pivot_longer(-c(rmse, sample_size, simu), values_to = "coverage", names_to = "estimator") %>%
45+
ggplot() + aes(x = estimator, y = coverage, fill = factor(sample_size)) + geom_boxplot()
46+
47+
### Single test
48+
Y <- rPLN(n = nrow(X), mu = X %*% B, Sigma = params$Sigma, depths = params$depths)
1349
data <- prepare_data(Y, X, offset = "none")
1450
logO <- attr(Y, "offsets")
15-
16-
conf <- list(variational_var = TRUE, jackknife = TRUE, bootstrap = nrow(Y))
17-
future::plan("multicore", workers = nb_cores)
1851
model <- PLN(Abundance ~ 0 + . + offset(logO), data = data, control = PLN_param(config_post = conf))
19-
future::plan("sequential")
2052

2153
B_hat <- coef(model)
2254
B_se_var <- standard_error(model, "variational")
23-
B_se_jk <- standard_error(model, "jackknife")
2455
B_se_bt <- standard_error(model, "bootstrap")
56+
B_se_sw <- standard_error(model, "sandwich")
2557

2658
data.frame(
2759
B = rep(c(B), 3),
2860
B_hat = rep(c(B_hat), 3),
29-
se = c(B_se_var, B_se_jk, B_se_bt),
30-
method = rep(c("variational", "jackknife", "bootstrap"), each = length(c(B))) ) %>%
61+
se = c(B_se_var, B_se_bt, B_se_sw),
62+
method = rep(c("variational", "bootstrap", "sandwich"), each = length(c(B))) ) %>%
3163
ggplot(aes(x = B, y = B_hat)) +
3264
geom_errorbar(aes(ymin = B_hat - 2 * se,
3365
ymax = B_hat + 2 * se), color = "blue") + facet_wrap(~ method) +
3466
geom_abline(slope = 1, intercept = 0) + labs(x = "True value", y = "Mean estimate") + theme_bw() -> p
3567

3668
print(p)
37-

man/PLNfit.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)