easystats
diff --git a/‎NAMESPACE‎
Lines changed: 3 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎NEWS.md‎
Lines changed: 7 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎R/residualize_over_grid.R‎
Lines changed: 164 additions & 0 deletions b/‎R/residualize_over_grid.R‎
Lines changed: 164 additions & 0 deletions
diff --git a/‎R/visualisation_recipe.R‎
Lines changed: 7 additions & 0 deletions b/‎R/visualisation_recipe.R‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎R/visualisation_recipe_internal.R‎
Lines changed: 67 additions & 10 deletions b/‎R/visualisation_recipe_internal.R‎
Lines changed: 67 additions & 10 deletions
@@ -56,6 +56,8 @@ S3method(print_md,visualisation_matrix)
 S3method(reshape_grouplevel,data.frame)
 S3method(reshape_grouplevel,default)
 S3method(reshape_grouplevel,estimate_grouplevel)
+S3method(residualize_over_grid,data.frame)
+S3method(residualize_over_grid,estimate_means)
 S3method(smoothing,data.frame)
 S3method(smoothing,numeric)
 S3method(standardize,estimate_contrasts)
@@ -99,6 +101,7 @@ export(pool_slopes)
 export(print_html)
 export(print_md)
 export(reshape_grouplevel)
+export(residualize_over_grid)
 export(smoothing)
 export(standardize)
 export(unstandardize)
 
@@ -31,6 +31,13 @@
 
 * `estimate_grouplevel()` now supports models from package *coxme*.
 
+* New function `residualize_over_grid()`, which residualizes a model
+  over a grid of predictors. This is useful to visualize the residuals of a
+  model over a grid of predictors.
+
+* `visualisation_recipe()` and `plot()` get a `show_residuals` argument,
+  to show the residuals of the model, related to the data grid, in the plot.
+
 * Documentation of the `display()` method for *modelbased* objects has been
   added.
 
 
@@ -0,0 +1,164 @@
+#' @title Compute partial residuals from a data grid
+#' @name residualize_over_grid
+#'
+#' @description This function computes partial residuals based on a data grid,
+#' where the data grid is usually a data frame from all combinations of factor
+#' variables or certain values of numeric vectors. This data grid is usually used
+#' as `newdata` argument in `predict()`, and can be created with
+#' [`insight::get_datagrid()`].
+#'
+#' @param grid A data frame representing the data grid, or an object of class
+#' `estimate_means` or `estimate_predicted`, as returned by the different
+#' `estimate_*()` functions.
+#' @param model The model for which to compute partial residuals. The data grid
+#' `grid` should match to predictors in the model.
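+#' @param predictor_name The name of the column in `grid` that contains the
+#' predicted values. The residuals are added to these values, and the result is
+#' returned in a column of the same name. For `estimate_means` objects, this
+#' name is taken from the `coef_name` attribute of `grid`.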
+#' @param ... Currently not used.
+#'
+#' @section Partial Residuals:
+#' For **generalized linear models** (glms), residualized scores are computed as
+#' `inv.link(link(Y) + r)` where `Y` are the predicted values on the response
+#' scale, and `r` are the *working* residuals.
+#'
+#' For (generalized) linear **mixed models**, the random effects are also
+#' partialled out.
+#'
+#' @references
+#' Fox J, Weisberg S. Visualizing Fit and Lack of Fit in Complex Regression
+#' Models with Predictor Effect Plots and Partial Residuals. Journal of
+#' Statistical Software 2018;87.
+#'
+#' @return A data frame with residuals for the focal predictor.
+#'
+#' @examplesIf requireNamespace("marginaleffects", quietly = TRUE)
+#' set.seed(1234)
+#' x1 <- rnorm(200)
+#' x2 <- rnorm(200)
+#' # quadratic relationship
+#' y <- 2 * x1 + x1^2 + 4 * x2 + rnorm(200)
+#'
+#' d <- data.frame(x1, x2, y)
+#' model <- lm(y ~ x1 + x2, data = d)
+#'
+#' pr <- estimate_means(model, c("x1", "x2"))
+#' head(residualize_over_grid(pr, model))
+#' @export
+residualize_over_grid <- function(grid, model, ...) {
+  UseMethod("residualize_over_grid") # S3 generic: dispatches on the class of `grid`
+}
+
+
+#' @rdname residualize_over_grid
+#' @export
+residualize_over_grid.data.frame <- function(grid, model, predictor_name, ...) {
+  # Method for plain data grids. `predictor_name` names the column of `grid`
+  # that holds the predicted values. Every observation of the model data is
+  # matched to a row of the grid; the prediction of that row plus the residual
+  # of the observation is returned in a column named `predictor_name`.
+  # Returns `NULL` (with an alert) if no residuals could be extracted.
+  old_d <- insight::get_predictors(model)
+  fun_link <- insight::link_function(model)
+  inv_fun <- insight::link_inverse(model)
+  predicted <- grid[[predictor_name]]
+  grid[[predictor_name]] <- NULL
+
+  # grid columns with a single unique value are dropped before matching
+  is_fixed <- vapply(grid, function(x) length(unique(x)) == 1L, logical(1))
+  grid <- grid[, !is_fixed, drop = FALSE]
+  shared_columns <- colnames(grid)[colnames(grid) %in% colnames(old_d)]
+  old_d <- old_d[, shared_columns, drop = FALSE]
+
+  if (!.is_grid(grid)) {
+    insight::format_error("Grid for partial residuals must be a fully crossed grid.")
+  }
+
+  # for each var, narrow down which grid rows match each observation
+  best_match <- NULL
+
+  for (p in colnames(old_d)) {
+    if (is.numeric(old_d[[p]])) {
+      grid[[p]] <- .validate_num(grid[[p]])
+    }
+    # if numeric in old data, find where it is closest
+    best_match <- .closest(old_d[[p]], grid[[p]], best_match = best_match)
+  }
+
+  # without a match matrix (e.g. no shared columns) there is nothing to index
+  if (!is.matrix(best_match)) {
+    insight::format_error("Could not match the data grid to the predictors of the model.")
+  }
+
+  # First matching grid row for every observation (one column of `best_match`
+  # per observation). The columns are visited explicitly, because
+  # `apply(best_match, 2, which)` simplifies its result to a matrix whenever
+  # all observations have the same number of matches, which would produce one
+  # index per matrix cell instead of one index per observation.
+  idx <- vapply(
+    seq_len(ncol(best_match)),
+    function(i) which(best_match[, i], useNames = FALSE)[1L],
+    integer(1)
+  )
+
+  # extract working residuals
+  res <- .safe(stats::residuals(model, type = "working"))
+
+  # if failed, and model linear, extract response residuals
+  if (is.null(res)) {
+    minfo <- insight::model_info(model)
+    if (isTRUE(minfo$is_linear)) {
+      res <- .safe(insight::get_residuals(model, type = "response"))
+    }
+  }
+
+  if (is.null(res)) {
+    insight::format_alert("Could not extract residuals.")
+    return(NULL)
+  }
+
+  my_points <- grid[idx, , drop = FALSE]
+  # add errors on the link scale, then transform back
+  my_points[[predictor_name]] <- inv_fun(fun_link(predicted[idx]) + res)
+
+  my_points
+}
+
+
+#' @export
+residualize_over_grid.estimate_means <- function(grid, model, ...) {
+  # Method for `estimate_means` objects: keep only the columns that are named
+  # in the object's attributes, then call the generic again on the resulting
+  # data frame, using the `coef_name` column as column of predicted values.
+  grid_attributes <- attributes(grid)
+
+  keep <- unique(c(
+    grid_attributes$trend,
+    grid_attributes$contrast,
+    grid_attributes$focal_terms,
+    grid_attributes$coef_name
+  ))
+
+  plain_grid <- as.data.frame(grid)
+  plain_grid <- plain_grid[colnames(plain_grid) %in% keep]
+
+  residualize_over_grid(
+    plain_grid,
+    model,
+    predictor_name = grid_attributes$coef_name,
+    ...
+  )
+}
+
+
+# utilities --------------------------------------------------------------------
+
+
+.is_grid <- function(df) {
+  # Checks whether `df` is a fully crossed grid, i.e. contains every
+  # combination of the unique values of its columns exactly once.
+  unique_values <- lapply(df, unique)
+  n_combinations <- prod(lengths(unique_values))
+
+  # a full crossing has exactly one row per combination
+  if (n_combinations != nrow(df)) {
+    return(FALSE)
+  }
+
+  # build the complete crossing and tag each of its rows with a marker column
+  full_grid <- do.call(expand.grid, args = unique_values)
+  full_grid$..1 <- 1
+
+  # outer join: the number of rows only stays the same if `df` has no
+  # duplicated combinations (and hence no missing ones)
+  merged <- merge(df, full_grid, by = colnames(df), all = TRUE)
+
+  sum(merged$..1) == sum(full_grid$..1)
+}
+
+
+.closest <- function(x, target, best_match) {
+  # Returns a logical matrix flagging, for every element of `x` (columns),
+  # the matching elements of `target` (rows): the closest values if `x` is
+  # numeric, equal values otherwise. If `best_match` is a matrix, only matches
+  # that are also flagged in `best_match` are kept.
+  candidates <- if (is.numeric(x)) {
+    distances <- abs(outer(x, target, FUN = `-`))
+    apply(distances, 1, function(d) d == min(d))
+  } else {
+    t(outer(x, target, FUN = `==`))
+  }
+
+  if (!is.matrix(best_match)) {
+    return(candidates)
+  }
+
+  candidates & best_match
+}
+
+
+.validate_num <- function(x) {
+  # Numeric input is returned untouched; anything else (e.g. a factor) is
+  # converted via its character representation.
+  if (is.numeric(x)) {
+    return(x)
+  }
+  as.numeric(as.character(x))
+}
@@ -39,6 +39,9 @@
 #' predictor. Use `FALSE` to always use continuous color scales for numeric
 #' predictors. It is possible to set a global default value using `options()`,
 #' e.g. `options(modelbased_numeric_as_discrete = 10)`.
+#' @param show_residuals Logical, if `TRUE`, display residuals of the model
+#' as a background to the model-based estimation. Residuals will be computed
+#' for the predictors in the data grid, using [`residualize_over_grid()`].
 #' @param point,line,pointrange,ribbon,facet,grid Additional
 #' aesthetics and parameters for the geoms (see customization example).
 #' @param ... Arguments passed from `plot()` to `visualisation_recipe()`, or
@@ -150,6 +153,7 @@
 #' @export
 visualisation_recipe.estimate_predicted <- function(x,
                                                     show_data = FALSE,
+                                                    show_residuals = FALSE,
                                                     point = NULL,
                                                     line = NULL,
                                                     pointrange = NULL,
@@ -173,6 +177,7 @@ visualisation_recipe.estimate_predicted <- function(x,
   .visualization_recipe(
     x,
     show_data = show_data,
+    show_residuals = show_residuals,
     point = point,
     line = line,
     pointrange = pointrange,
@@ -230,6 +235,7 @@ visualisation_recipe.estimate_slopes <- function(x,
   .visualization_recipe(
     x,
     show_data = FALSE,
+    show_residuals = FALSE,
     line = line,
     pointrange = pointrange,
     ribbon = ribbon,
@@ -287,6 +293,7 @@ visualisation_recipe.estimate_grouplevel <- function(x,
   .visualization_recipe(
     x,
     show_data = FALSE,
+    show_residuals = FALSE,
     line = line,
     pointrange = pointrange,
     ribbon = ribbon,
 
@@ -246,6 +246,7 @@
 #' @keywords internal
 .visualization_recipe <- function(x,
                                   show_data = TRUE,
+                                  show_residuals = FALSE,
                                   point = NULL,
                                   line = NULL,
                                   pointrange = NULL,
@@ -276,7 +277,7 @@
 
   # Don't plot raw data if `predict` is not on the response scale
   if (!is.null(response_scale) && !response_scale %in% c("prediction", "response", "expectation", "invlink(link)")) {
-    show_data <- FALSE
+    show_data <- show_residuals <- FALSE
   }
 
   # Don't plot raw data for transformed responses with no back-transformation
@@ -286,20 +287,27 @@
     # add information about response transformation
     trans_fun <- .safe(insight::find_transformation(attributes(x)$model))
     if (!is.null(trans_fun) && all(trans_fun != "identity")) {
-      show_data <- FALSE
+      show_data <- show_residuals <- FALSE
     }
   }
 
-
   # add raw data as first layer ----------------------------------
   if (show_data) {
-    layers[[paste0("l", l)]] <- .visualization_recipe_rawdata(x, aes)
+    layers[[paste0("l", l)]] <- .visualization_recipe_rawdata(x, aes, numeric_as_discrete)
     # Update with additional args
     if (!is.null(point)) layers[[paste0("l", l)]] <- utils::modifyList(layers[[paste0("l", l)]], point)
     l <- l + 1
   }
 
 
+  # add residual data as next lowest layer
+  if (show_residuals) {
+    layers[[paste0("l", l)]] <- .visualization_recipe_residuals(x, aes, numeric_as_discrete)
+    # Update with additional args
+    if (!is.null(point)) layers[[paste0("l", l)]] <- utils::modifyList(layers[[paste0("l", l)]], point)
+    l <- l + 1
+  }
+
   # intercept line for slopes ----------------------------------
   if (inherits(x, "estimate_slopes")) {
     layers[[paste0("l", l)]] <- insight::compact_list(list(
@@ -469,15 +477,64 @@
 
 
 #' @keywords internal
-.visualization_recipe_rawdata <- function(x, aes) {
+.visualization_recipe_rawdata <- function(x, aes, numeric_as_discrete = 8) {
   model <- attributes(x)$model
   rawdata <- insight::get_data(model, verbose = FALSE)
 
   # Add the response column to the raw data if it is not included yet
   y <- insight::find_response(attributes(x)$model)
-  if (!y %in% names(rawdata)) rawdata[y] <- insight::get_response(attributes(x)$model, verbose = FALSE)
+  if (!y %in% names(rawdata)) {
+    rawdata[y] <- insight::get_response(attributes(x)$model, verbose = FALSE)
+  }
+
+  # a numeric color variable with fewer unique values than `numeric_as_discrete`
+  # is converted into a factor, so that it is not treated as a continuous
+  # variable. The factor levels are ordered by their numeric value, thus we
+  # need "sort()"
+  if (!is.null(aes$color) && is.numeric(rawdata[[aes$color]]) && insight::n_unique(rawdata[[aes$color]]) < numeric_as_discrete) {
+    new_values <- insight::format_value(rawdata[[aes$color]], protect_integers = TRUE)
+    rawdata[[aes$color]] <- factor(new_values, levels = as.character(sort(as.numeric(unique(new_values)))))
+  }
+
+  .data_point_geom(
+    model = model,
+    aes = aes,
+    data = rawdata,
+    y = y
+  )
+}
 
-  if (aes$type == "pointrange" && !is.numeric(rawdata[[aes$x]])) {
+
+# residuals ----------------------------------------------------------------
+
+
+#' @keywords internal
+.visualization_recipe_residuals <- function(x, aes, numeric_as_discrete = 8) {
+  # Builds the point layer that shows partial residuals for the estimates in
+  # `x`. The residuals are computed by `residualize_over_grid()`, using the
+  # model stored in the "model" attribute of `x`. `aes` holds the aesthetics
+  # mapping (`x`, `color`, ...), `numeric_as_discrete` is the threshold of
+  # unique values below which a numeric color variable is shown as discrete.
+  model <- attributes(x)$model
+  residual_data <- residualize_over_grid(x, model)
+
+  # `residualize_over_grid()` writes the residualized values into the column
+  # named by the `coef_name` attribute of `x`. Use that name for the y-aesthetic
+  # instead of assuming "Mean"; "Mean" is only the fallback when the attribute
+  # is not set.
+  y_name <- attributes(x)$coef_name
+  if (is.null(y_name)) {
+    y_name <- "Mean"
+  }
+
+  # a numeric color variable with fewer unique values than `numeric_as_discrete`
+  # is converted into a factor, so that it is not treated as a continuous
+  # variable. The factor levels are ordered by their numeric value, thus we
+  # need "sort()"
+  color_var <- aes$color
+  if (!is.null(color_var) &&
+    is.numeric(residual_data[[color_var]]) &&
+    insight::n_unique(residual_data[[color_var]]) < numeric_as_discrete) {
+    new_values <- insight::format_value(residual_data[[color_var]], protect_integers = TRUE)
+    residual_data[[color_var]] <- factor(
+      new_values,
+      levels = as.character(sort(as.numeric(unique(new_values))))
+    )
+  }
+
+  .data_point_geom(
+    model = model,
+    aes = aes,
+    data = residual_data,
+    y = y_name
+  )
+}
+
+
+# helpers -----------------------------------------------------------------
+
+.data_point_geom <- function(model, aes, data, y) {
+  if (aes$type == "pointrange" && !is.numeric(data[[aes$x]])) {
     geom <- "jitter"
   } else {
     geom <- "point"
@@ -493,7 +550,7 @@
 
   out <- list(
     geom = geom,
-    data = rawdata,
+    data = data,
     aes = list(
       y = y,
       x = aes$x,
@@ -508,10 +565,10 @@
   # check if we have matching columns in the raw data - some functions,
   # likes slopes, have mapped these aes to other columns that are not part
   # of the raw data - we set them to NULL
-  if (!is.null(aes$color) && !aes$color %in% colnames(rawdata)) {
+  if (!is.null(aes$color) && !aes$color %in% colnames(data)) {
     out$aes$color <- NULL
   }
-  if (!is.null(aes$alpha) && !aes$alpha %in% colnames(rawdata)) {
+  if (!is.null(aes$alpha) && !aes$alpha %in% colnames(data)) {
     out$aes$alpha <- NULL
   }