Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 127 additions & 6 deletions MOSAIC/Functions/pums_fx.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
##### Reclassify Asian and NHPI Ancestries ########
anhpi_reclass <- function(x, acs_yr, ancestry_list) { # used in MOSAIC Living Wage test
anhpi_reclass <- function(x, ancestry_list) {
# x = pums dataframe
# acs_yr = last year of ACS 5y estimates
# ancestry_list = dataframe containing all PUMS ANC1P/ANC2P codes and labels, plus 1 binary column for asian and 1 for nhpi
## import ANC1P/ANC2P codes/labels pulled from https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2023.pdf
print(paste0("Adding ancestry labels from:", ancestry_list))
message("Adding ancestry labels from your selected ancestry_list")
anc_codes <- ancestry_list %>%
mutate(anc_label = tolower(gsub(" ", "_", anc_label)))

# get list of AA or PI ancestries actually in CA PUMS data
aapi_incl <- x %>% select(ANC1P) %>%
unique() %>%
inner_join(anc_codes %>% filter((asian == 1 | nhpi ==1)), by ="ANC1P") %>%
arrange(anc_label)
print("Displaying all the AA or PI ancestries actually present in the data as aapi_incl df.")
message("Displaying all the AA or PI ancestries actually present in the data as aapi_incl df.")
View(aapi_incl)

x <- x %>% left_join(anc_codes %>% select(ANC1P, anc_label))
x <- x %>% left_join(anc_codes %>% select(ANC1P, anc_label), by = c("ANC2P" = "ANC1P"))
reclass_list <- list(people = x, aapi_incl = aapi_incl)

return(reclass_list)
return(reclass_list)
}


Expand Down Expand Up @@ -162,7 +162,128 @@ calc_pums_pop <- function(var_name) {
return(num_df)
}

##### Calc ANHPI PUMS pop ########
calc_pums_ind <- function(d, weight, repwlist, vars, indicator) {
# d = dataframe
# weight = defined earlier in script. PWGTP for person-level (psam_p06.csv) or WGTP for housing unit-level (psam_h06.csv) analysis
# repwlist = defined earlier in script.
# vars = the list of asian and nhpi subgroups (aapi_incl$anc_label)
# indicator = name of column that contains indicator data, eg: 'living_wage' which contains values 'livable' and 'not livable'
### Then run something like this: pop_table <- map(vars, ppl_state) |> list_rbind()

library(purrr)
library(readr)

start_time <- Sys.time() # start timer

# ── 1. Build survey & denominators ONCE ─────────────────────────────────────
message("Building survey object...")

anhpi_srvy <- d %>%
as_survey_rep(
variables = c(geoid, geoname, asian, nhpi, all_of(vars), !!sym(indicator)),
weights = !!sym(weight),
repweights = all_of(repwlist),
combined_weights = TRUE,
mse = TRUE,
type = "other",
scale = 4/80,
rscale = rep(1, 80)
) %>%
filter(!is.na(!!sym(indicator))) # filter out rows where indicator is NA

asian_srvy <- anhpi_srvy %>% filter(asian == 1)
nhpi_srvy <- anhpi_srvy %>% filter(nhpi == 1)

# ── 2. Helper: compute metrics from num/rate SE columns ──────────────────────
add_metrics <- function(df, den_df, group_label, subgroup_label) {
df %>%
left_join(den_df, by = c("geoid", "geoname")) %>%
mutate(
group = group_label,
subgroup = subgroup_label,
rate_moe = rate_se * 1.645 * 100,
rate_cv = ifelse(rate > 0, (rate_se / rate) * 100, NA_real_),
rate = rate * 100,
count_moe = num_se * 1.645,
count_cv = ifelse(num > 0, (num_se / num) * 100, NA_real_)
)
}

# ── 3. GROUP-LEVEL: total, asian & nhpi ───────────
# Estimates/rates for total and records where asian == 1 or nhpi == 1,
# grouped by indicator value
message("Calculating total and group-level stats (asian, nhpi)...")

calc_group <- function(srvy, group_label) {

# calc denominator
den_ <- srvy %>%
group_by(geoid, geoname) %>%
summarise(pop = survey_total(na.rm = TRUE), .groups = "drop")

# calc raw, rate
srvy %>%
group_by(geoid, geoname, !!sym(indicator)) %>%
summarise(
num = survey_total(na.rm = TRUE),
rate = survey_mean(na.rm = TRUE),
.groups = "drop"
) %>%
add_metrics(den_, group_label, group_label)
}

df_total <- calc_group(anhpi_srvy, "total")
df_asian <- calc_group(asian_srvy, "asian")
df_nhpi <- calc_group(nhpi_srvy, "nhpi")

df_group <- bind_rows(df_total, df_asian, df_nhpi)

# ── 4. SUBGROUP-LEVEL: each var in vars, denominator = var == 1 ──────────────
# Estimates/rates grouped by indicator value,
# denominator = all records where that var == 1
calc_one_subgroup <- function(var_name) {
message(paste0(" Calculating subgroup: ", var_name))

# Determine ancestry group to pick correct survey & denominator
den_group <- case_when(
var_name %in% (aapi_incl %>% filter(nhpi == 1) %>% pull(anc_label)) ~ "nhpi",
var_name %in% (aapi_incl %>% filter(asian == 1) %>% pull(anc_label)) ~ "asian"
)

srvy_subset <- switch(den_group, "asian" = asian_srvy, "nhpi" = nhpi_srvy)

# Denominator: all records in this subgroup (var == 1)
den_var <- srvy_subset %>%
filter(.data[[var_name]] == 1) %>%
group_by(geoid, geoname) %>%
summarise(pop = survey_total(na.rm = TRUE), .groups = "drop")

# Numerator: subgroup records grouped by indicator value
srvy_subset %>%
filter(.data[[var_name]] == 1) %>%
group_by(geoid, geoname, !!sym(indicator)) %>%
summarise(
num = survey_total(na.rm = TRUE),
rate = survey_mean(na.rm = TRUE),
.groups = "drop"
) %>%
add_metrics(den_var, den_group, var_name)
}

# ── 5. Run subgroup calcs sequentially (survey too large for parallel export) ─
message("Running subgroup calculations...")

num_df_subgroups <- map(vars, calc_one_subgroup) |> list_rbind()

# ── 6. Combine & return ──────────────────────────────────────────────────────
result <- bind_rows(df_group, num_df_subgroups)

elapsed <- round(difftime(Sys.time(), start_time, units = "mins"), 2)
message(paste0("Done! calc_pums_ind completed in ", elapsed, " minutes."))

return(result)
}



Expand Down
Loading