catalystcalifornia · lforouzan · Mar 4, 2026 · Mar 4, 2026 · Mar 13, 2026 · Mar 13, 2026
diff --git a/MOSAIC/Functions/pums_fx.R b/MOSAIC/Functions/pums_fx.R
@@ -1,26 +1,26 @@
 ##### Reclassify Asian and NHPI Ancestries ######## 
-anhpi_reclass <- function(x, acs_yr, ancestry_list) {  # used in MOSAIC Living Wage test
+anhpi_reclass <- function(x, ancestry_list) {  
   # x = pums dataframe
   # acs_yr = last year of ACS 5y estimates
   # ancestry_list = dataframe containing all PUMS ANC1P/ANC2P codes and labels, plus 1 binary column for asian and 1 for nhpi
   ## import ANC1P/ANC2P codes/labels pulled from https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2023.pdf
-  print(paste0("Adding ancestry labels from:", ancestry_list))
+  message("Adding ancestry labels from your selected ancestry_list")
   anc_codes <- ancestry_list %>%
     mutate(anc_label = tolower(gsub(" ", "_", anc_label)))
-
+  
   # get list of AA or PI ancestries actually in CA PUMS data
   aapi_incl <- x %>% select(ANC1P) %>% 
     unique() %>%
     inner_join(anc_codes %>% filter((asian == 1 | nhpi ==1)), by ="ANC1P") %>%
     arrange(anc_label)
-  print("Displaying all the AA or PI ancestries actually present in the data as aapi_incl df.")
+  message("Displaying all the AA or PI ancestries actually present in the data as aapi_incl df.")
   View(aapi_incl)
-
+  
   x <- x %>% left_join(anc_codes %>% select(ANC1P, anc_label))
   x <- x %>% left_join(anc_codes %>% select(ANC1P, anc_label), by = c("ANC2P" = "ANC1P"))
   reclass_list <- list(people = x, aapi_incl = aapi_incl)
 
-return(reclass_list)
+  return(reclass_list)
 }
 
 
@@ -162,7 +162,128 @@ calc_pums_pop <- function(var_name) {
 return(num_df)  
 }
 
+##### Calc ANHPI PUMS pop ######## 
+calc_pums_ind <- function(d, weight, repwlist, vars, indicator) {
+  # d = dataframe
+  # weight = defined earlier in script. PWGTP for person-level (psam_p06.csv) or WGTP for housing unit-level (psam_h06.csv) analysis
+  # repwlist = defined earlier in script.
+  # vars = the list of asian and nhpi subgroups (aapi_incl$anc_label)
+  # indicator = name of column that contains indicator data, eg: 'living_wage' which contains values 'livable' and 'not livable'
+  ### Then run something like this: pop_table <- map(vars, ppl_state)   |> list_rbind()
 
+  library(purrr)
+  library(readr)
+
+  start_time <- Sys.time()  # start timer
+
+  # ── 1. Build survey & denominators ONCE ─────────────────────────────────────
+  message("Building survey object...")
+
+  anhpi_srvy <- d %>%
+    as_survey_rep(
+      variables        = c(geoid, geoname, asian, nhpi, all_of(vars), !!sym(indicator)),
+      weights          = !!sym(weight),
+      repweights       = all_of(repwlist),
+      combined_weights = TRUE,
+      mse              = TRUE,
+      type             = "other",
+      scale            = 4/80,
+      rscale           = rep(1, 80)
+    ) %>%
+    filter(!is.na(!!sym(indicator)))   # filter out rows where indicator is NA
+
+  asian_srvy <- anhpi_srvy %>% filter(asian == 1)
+  nhpi_srvy  <- anhpi_srvy %>% filter(nhpi  == 1)
+
+  # ── 2. Helper: compute metrics from num/rate SE columns ──────────────────────
+  add_metrics <- function(df, den_df, group_label, subgroup_label) {
+    df %>%
+      left_join(den_df, by = c("geoid", "geoname")) %>%
+      mutate(
+        group     = group_label,
+        subgroup  = subgroup_label,
+        rate_moe  = rate_se * 1.645 * 100,
+        rate_cv   = ifelse(rate > 0, (rate_se / rate) * 100, NA_real_),
+        rate      = rate * 100,
+        count_moe = num_se * 1.645,
+        count_cv  = ifelse(num   > 0, (num_se  / num)  * 100, NA_real_)
+      )
+  }
+
+  # ── 3. GROUP-LEVEL: total, asian & nhpi ───────────
+  # Estimates/rates for total and records where asian == 1 or nhpi == 1,
+  # grouped by indicator value
+  message("Calculating total and group-level stats (asian, nhpi)...")
+
+  calc_group <- function(srvy, group_label) {
+
+    # calc denominator
+    den_ <- srvy %>%
+      group_by(geoid, geoname) %>%
+      summarise(pop = survey_total(na.rm = TRUE), .groups = "drop")
+
+    # calc raw, rate	
+    srvy %>%
+      group_by(geoid, geoname, !!sym(indicator)) %>%
+      summarise(
+        num  = survey_total(na.rm = TRUE),
+        rate = survey_mean(na.rm = TRUE),
+        .groups = "drop"
+      ) %>%
+      add_metrics(den_, group_label, group_label)
+  }
+
+  df_total <- calc_group(anhpi_srvy, "total")
+  df_asian <- calc_group(asian_srvy, "asian")
+  df_nhpi  <- calc_group(nhpi_srvy, "nhpi")
+
+  df_group <- bind_rows(df_total, df_asian, df_nhpi)
+
+  # ── 4. SUBGROUP-LEVEL: each var in vars, denominator = var == 1 ──────────────
+  # Estimates/rates grouped by indicator value,
+  # denominator = all records where that var == 1
+  calc_one_subgroup <- function(var_name) {
+    message(paste0("  Calculating subgroup: ", var_name))
+
+    # Determine ancestry group to pick correct survey & denominator
+    den_group <- case_when(
+      var_name %in% (aapi_incl %>% filter(nhpi  == 1) %>% pull(anc_label)) ~ "nhpi",
+      var_name %in% (aapi_incl %>% filter(asian == 1) %>% pull(anc_label)) ~ "asian"
+    )
+
+    srvy_subset <- switch(den_group, "asian" = asian_srvy, "nhpi" = nhpi_srvy)
+
+    # Denominator: all records in this subgroup (var == 1)
+    den_var <- srvy_subset %>%
+      filter(.data[[var_name]] == 1) %>%
+      group_by(geoid, geoname) %>%
+      summarise(pop = survey_total(na.rm = TRUE), .groups = "drop")
+
+    # Numerator: subgroup records grouped by indicator value
+    srvy_subset %>%
+      filter(.data[[var_name]] == 1) %>%
+      group_by(geoid, geoname, !!sym(indicator)) %>%
+      summarise(
+        num  = survey_total(na.rm = TRUE),
+        rate = survey_mean(na.rm = TRUE),
+        .groups = "drop"
+      ) %>%
+      add_metrics(den_var, den_group, var_name)
+  }
+
+  # ── 5. Run subgroup calcs sequentially (survey too large for parallel export) ─
+  message("Running subgroup calculations...")
+
+  num_df_subgroups <- map(vars, calc_one_subgroup) |> list_rbind()
+
+  # ── 6. Combine & return ──────────────────────────────────────────────────────
+  result <- bind_rows(df_group, num_df_subgroups)
+
+  elapsed <- round(difftime(Sys.time(), start_time, units = "mins"), 2)
+  message(paste0("Done! calc_pums_ind completed in ", elapsed, " minutes."))
+
+return(result)
+}