Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
24e3c50
Merge branch 'staging' of https://github.com/catalystcalifornia/RaceC…
bakeralexan Aug 29, 2025
25ba9a9
Merge branch 'staging' of https://github.com/catalystcalifornia/RaceC…
bakeralexan Sep 5, 2025
20e9e0f
deleted demo leg index and slightly modified educ index script
bakeralexan Sep 8, 2025
0d6c375
deleted an unused line
bakeralexan Sep 8, 2025
24d50f2
Merge branch 'staging' of https://github.com/catalystcalifornia/RaceC…
bakeralexan Sep 18, 2025
2b2b54b
Merge branch 'staging' of https://github.com/catalystcalifornia/RaceC…
bakeralexan Oct 1, 2025
9a4de4a
Merge branch 'staging' of https://github.com/catalystcalifornia/RaceC…
bakeralexan Oct 10, 2025
1233599
Merge branch 'staging' of https://github.com/catalystcalifornia/RaceC…
bakeralexan Oct 20, 2025
8b14107
Merge branch 'staging' of https://github.com/catalystcalifornia/RaceC…
bakeralexan Oct 23, 2025
6d6450b
Merge branch 'staging' of https://github.com/catalystcalifornia/RaceC…
bakeralexan Nov 24, 2025
8ad9892
Merge branch 'staging' of https://github.com/catalystcalifornia/RaceC…
bakeralexan Feb 24, 2026
a532727
saving updates before creating additional branch
bakeralexan Mar 7, 2026
2164315
resolved merge conflicts
bakeralexan Mar 7, 2026
7d5547f
changing the name of my script to match the naming convention of the …
bakeralexan Mar 7, 2026
788e3c3
trying to undo any small changes to the reference script
bakeralexan Mar 7, 2026
c7f22b5
adding spaces back in
bakeralexan Mar 7, 2026
728fe26
replaced part of function
bakeralexan Mar 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 111 additions & 97 deletions MOSAIC/Functions/acs_fx.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@ get_detailed_race <- function(table, race, year = 2021) {
str_detect(race, regex('asian', ignore_case = TRUE)) ~ '-04',
str_detect(race, regex('nhpi', ignore_case = TRUE)) ~ '-05',
.default = ''
)

)
# return error if race is not already included in available race_code list
if (!(race_code %in% c('-04','-05'))) {
return(print("This function doesn't pull data for the race you selected. Please select either Asian or NHPI or talk to Leila about adding an additional race."))
}

if (!(race_code %in% c('-04','-05'))) {
return(print("This function doesn't pull data for the race you selected. Please select either Asian or NHPI or talk to Leila about adding an additional race."))
}
table_name <- toupper(table)

city_api_call <- sprintf(
Expand All @@ -48,50 +48,50 @@ get_detailed_race <- function(table, race, year = 2021) {
state_api_call <- sprintf(
"https://api.census.gov/data/%s/acs/acs5/spt?get=group(%s)&POPGROUP=pseudo(%s)&ucgid=0400000US06",
year, table_name, race_code)

api_call_list <- c(city_api_call, county_api_call, state_api_call)

# Loop through API calls
parsed_list <- list() # create empty list for loop results

for (i in api_call_list) {
api_call_list <- c(city_api_call, county_api_call, state_api_call)

response <- GET(i)
status_code(response)
# Loop through API calls
parsed_list <- list() # create empty list for loop results

# Check if the request was successful
if (status_code(response) == 200) {
print("API request successful.")
} else {
stop("API request failed with status code: ", status_code(response))
for (i in api_call_list) {

response <- GET(i)
status_code(response)

# Check if the request was successful
if (status_code(response) == 200) {
print("API request successful.")
} else {
stop("API request failed with status code: ", status_code(response))
}

raw_content <- content(response, "text") # returns the response body as text
parsed_data <- fromJSON(raw_content)

parsed_list[[i]] <- parsed_data # put loop results into a list

}

raw_content <- content(response, "text") # returns the response body as text
parsed_data <- fromJSON(raw_content)

parsed_list[[i]] <- parsed_data # put loop results into a list
# clean up data
parsed_data <- do.call(rbind, parsed_list) %>% as.data.frame()
clean_data <- parsed_data
colnames(clean_data) <- clean_data[1, ] # replace col names with first row values
clean_data <- clean_data[, !duplicated(names(clean_data))] # drop any duplicate cols, eg: POPGROUP
clean_data <- clean_data %>%
mutate(across(contains(table_name), as.numeric)) # assign numeric cols to numeric type
clean_data <- clean_data %>%
filter(!if_all(where(is.numeric), is.na)) # drop rows where all numeric values are NA, this also removes the extra 'header' rows
clean_data$geoid <- str_replace(clean_data$GEO_ID, ".*US", "") # clean geoids
clean_data$geolevel <- case_when( # add geolevel bc it's a multigeo table
nchar(clean_data$geoid) == 2 ~ 'state',
nchar(clean_data$geoid) == 5 ~ 'county',
.default = 'place'
)

}

# clean up data
parsed_data <- do.call(rbind, parsed_list) %>% as.data.frame()
clean_data <- parsed_data
colnames(clean_data) <- clean_data[1, ] # replace col names with first row values
clean_data <- clean_data[, !duplicated(names(clean_data))] # drop any duplicate cols, eg: POPGROUP
clean_data <- clean_data %>%
mutate(across(contains(table_name), as.numeric)) # assign numeric cols to numeric type
clean_data <- clean_data %>%
filter(!if_all(where(is.numeric), is.na)) # drop rows where all numeric values are NA, this also removes the extra 'header' rows
clean_data$geoid <- str_replace(clean_data$GEO_ID, ".*US", "") # clean geoids
clean_data$geolevel <- case_when( # add geolevel bc it's a multigeo table
nchar(clean_data$geoid) == 2 ~ 'state',
nchar(clean_data$geoid) == 5 ~ 'county',
.default = 'place'
)

clean_data <- clean_data %>%
select(where(~!all(is.na(.)))) # drop cols where all vals are NA, eg: X_EA and X_MA the annotation cols

# reformat clean data
df_wide <- clean_data %>%
pivot_longer(
Expand All @@ -117,7 +117,7 @@ clean_data$geolevel <- case_when( # add geolevel
clean_geo_names() %>%
mutate(name = geoname) %>%
select(-geoname)

# prep metadata
metadata <- clean_data %>%
pivot_longer(cols = starts_with("B"),
Expand All @@ -129,8 +129,8 @@ clean_data$geolevel <- case_when( # add geolevel
unique() %>%
mutate(var_suff = sub(".*_", "", var),
generic_var = gsub(("E|M"), "", var),
new_var = tolower(paste0(table, "_", POPGROUP, "_", var_suff)))
new_var = tolower(paste0(table_code, "_", POPGROUP, "_", var_suff)))

# load variable names
v21 <- load_variables(year, "acs5", cache = TRUE)
table_vars <- v21 %>% filter(grepl(table_name, name))
Expand All @@ -152,14 +152,14 @@ clean_data$geolevel <- case_when( # add geolevel
new_label = c("","fips code","city, county, state"))

metadata_final <- rbind(new_rows, metadata_)

data_list <- list(df_wide, metadata_final)

names(data_list) <- c(paste0(race,"_df"), "metadata")

return(data_list)
}

data_list <- list(df_wide, metadata_final)

names(data_list) <- c(paste0(race,"_df"), "metadata")

return(data_list)
}


#### Send raw detailed tables to postgres - info for postgres tables automatically updates ####
send_to_mosaic <- function(acs_table, df_list, table_schema){
Expand Down Expand Up @@ -226,9 +226,10 @@ prep_acs <- function(x, race, table_code, cv_threshold, pop_threshold) {
# Execute the query and store the results in an R data frame
# column_metadata <- dbGetQuery(con, query) %>% filter(grepl("e",column_name)) %>% filter(grepl("001",column_name)) # get unique codes, eg: 051, 052
# print(column_metadata) # this df was used to create the renaming rules below

# renaming rules will change depending on type of census table
if (startsWith(table_code, "b") && startsWith(table_name, "nhpi")) {
if (startsWith(table_code, "b") && startsWith(table_name, "nhpi") && !startsWith(table_code, "b27001")) {

table_051_code = paste0(table_code, "_051_")
table_052_code = paste0(table_code, "_052_")
table_053_code = paste0(table_code, "_053_")
Expand Down Expand Up @@ -273,7 +274,7 @@ prep_acs <- function(x, race, table_code, cv_threshold, pop_threshold) {
names(x) <- gsub(table_176_code, "marshallese_aoic", names(x))
names(x) <- gsub(table_177_code, "palauan_aoic", names(x))

} else if (startsWith(table_code, "b") && startsWith(table_name, "asian")) {
} else if (startsWith(table_code, "b") && startsWith(table_name, "asian") && !startsWith(table_code, "b27001")) {
table_013_code = paste0(table_code, "_013_")
table_014_code = paste0(table_code, "_014_")
table_015_code = paste0(table_code, "_015_")
Expand Down Expand Up @@ -360,7 +361,7 @@ prep_acs <- function(x, race, table_code, cv_threshold, pop_threshold) {
names(x) <- gsub(table_084_code, "nepalese_aoic", names(x))
names(x) <- gsub(table_085_code, "okinawan_aoic", names(x))

} else {
} else if (!startsWith(table_code, "b27001")) {
stop('The column renaming function did not work for the table you have submitted. Please check your table.')
}

Expand Down Expand Up @@ -537,41 +538,52 @@ prep_acs <- function(x, race, table_code, cv_threshold, pop_threshold) {

}

if(endsWith(table_code, "b23025")) { # MK: EMPLOYMENT DETAILED TABLE

x <- x %>% select(-contains(c("002", "003","005", "006", "007"))) %>% # dropping everything but total and employed
select(geoid, name, geolevel, everything())

if (startsWith(table_code, "b27001")) {

# pivot longer
x_long <- x %>%
pivot_longer(
cols = -c(geoid, name, geolevel),
names_to = c("ethnic_group", "line", "stat"),
names_pattern = "^(.*?)(001|004)(e|m)$",
names_to = c("ethnic_group", "measure"),
names_pattern = "^(b27001_[a-z0-9]+)_(pop_moe|raw_moe|pop|raw)$",
values_to = "value"
) %>%
pivot_wider(names_from = measure, values_from = value) %>%
mutate(
measure = case_when(
line == "001" & stat == "e" ~ "pop",
line == "001" & stat == "m" ~ "pop_moe",
line == "004" & stat == "e" ~ "raw",
line == "004" & stat == "m" ~ "raw_moe"
)
) %>%
select(-line, -stat) %>%
pivot_wider(
names_from = measure,
values_from = value
rate = ifelse(pop <= 0 | is.na(pop), NA, raw / pop * 100),
rate_moe = moe_prop(raw, pop, raw_moe, pop_moe) * 100
)
# rename ethnic_group codes to labels
code_to_label <- c(
"b27001_013" = "indian", "b27001_014" = "bangladeshi",
"b27001_015" = "cambodian", "b27001_016" = "chinese",
"b27001_017" = "chinese_no_taiwan", "b27001_018" = "taiwanese",
"b27001_019" = "filipino", "b27001_020" = "hmong",
"b27001_021" = "indonesian", "b27001_022" = "japanese",
"b27001_023" = "korean", "b27001_024" = "laotian",
"b27001_025" = "malaysian", "b27001_026" = "pakistani",
"b27001_027" = "sri_lankan", "b27001_028" = "thai",
"b27001_029" = "vietnamese", "b27001_032" = "indian_aoic",
"b27001_033" = "bangladeshi_aoic", "b27001_034" = "cambodian_aoic",
"b27001_035" = "chinese_aoic", "b27001_036" = "chinese_no_taiwan_aoic",
"b27001_037" = "taiwanese_aoic", "b27001_038" = "filipino_aoic",
"b27001_039" = "hmong_aoic", "b27001_040" = "indonesian_aoic",
"b27001_041" = "japanese_aoic", "b27001_042" = "korean_aoic",
"b27001_043" = "laotian_aoic", "b27001_044" = "malaysian_aoic",
"b27001_045" = "pakistani_aoic", "b27001_046" = "sri_lankan_aoic",
"b27001_047" = "thai_aoic", "b27001_048" = "vietnamese_aoic",
"b27001_072" = "bhutanese", "b27001_073" = "burmese",
"b27001_075" = "mongolian", "b27001_076" = "nepalese",
"b27001_081" = "burmese_aoic", "b27001_083" = "mongolian_aoic",
"b27001_084" = "nepalese_aoic", "b27001_085" = "okinawan_aoic"
)

# calc raced rates
x_long <- x_long %>%
mutate(rate = ifelse(pop <= 0, NA, raw / pop * 100),
rate_moe = moe_prop(raw, pop, raw_moe, pop_moe)*100)

mutate(ethnic_group = recode(ethnic_group, !!!code_to_label))
}

if (startsWith(table_code, "s2802") | startsWith(table_code, "s2701")) {
if (startsWith(table_code, "s2802")) {

old_names <- colnames(x)[-(1:3)]
new_names <- c("total_pop", "black_pop", "aian_pop", "asian_pop", "pacisl_pop",
"other_pop", "twoormor_pop", "latino_pop", "nh_white_pop",
Expand Down Expand Up @@ -684,20 +696,20 @@ prep_acs <- function(x, race, table_code, cv_threshold, pop_threshold) {
}

# Finish up data cleaning
# make colnames lower case
colnames(x_long) <- tolower(colnames(x_long))
# Clean geo names
x_long$name <- gsub(", California", "", x_long$name)
x_long$name <- gsub(" County", "", x_long$name)
x_long$name <- gsub(" city", "", x_long$name)
x_long$name <- gsub(" town", "", x_long$name)
x_long$name <- gsub(" CDP", "", x_long$name)
x_long$name <- str_remove(x_long$name, "\\s*\\(.*\\)\\s*")
x_long$name <- gsub("; California", "", x_long$name)
# make colnames lower case
colnames(x_long) <- tolower(colnames(x_long))

# Clean geo names
x_long$name <- gsub(", California", "", x_long$name)
x_long$name <- gsub(" County", "", x_long$name)
x_long$name <- gsub(" city", "", x_long$name)
x_long$name <- gsub(" town", "", x_long$name)
x_long$name <- gsub(" CDP", "", x_long$name)
x_long$name <- str_remove(x_long$name, "\\s*\\(.*\\)\\s*")
x_long$name <- gsub("; California", "", x_long$name)

### Coefficient of Variation (CV) CALCS #####

### calc cv's
## Calculate CV values for all rates - store in columns as cv_[race]_rate
if (!is.na(cv_threshold)){
Expand All @@ -722,14 +734,17 @@ prep_acs <- function(x, race, table_code, cv_threshold, pop_threshold) {
# if pop_threshold ex_longists and cv_threshold ex_longists, check population and cv (i.e. B25003, S2301, S2802, S2701, B25014)
## Screen out rates with high CVs and low populations
df$rate <- ifelse(df$rate_cv > cv_threshold, NA, ifelse(df$pop < pop_threshold, NA, df$rate))
df$raw <- ifelse(df$rate_cv > cv_threshold, NA, ifelse(df$pop < pop_threshold, NA, df$raw))

if (startsWith(table_code, "b27001")) {
# for b27001 only suppress raw based on pop threshold, not CV since the cvs are so big for uninsured
df$raw <- ifelse(df$pop < pop_threshold, NA, df$raw)
} else {
df$raw <- ifelse(df$rate_cv > cv_threshold, NA, ifelse(df$pop < pop_threshold, NA, df$raw))
}
} else {
# Only DP05 should hit this condition
# Will use to change population values < 0 to NA (negative values are Census annotations)
pop_columns <- colnames(dplyr::select(df, ends_with("_pop")))
df[,pop_columns] <- sapply(df[,pop_columns], function(x_long) ifelse(x_long<0, NA, x_long))

df[,pop_columns] <- sapply(df[,pop_columns], function(val) ifelse(val < 0, NA, val)) # confusing if variable is also called x_long so I changed it for clarity
}

df_wide <- pivot_wider(df,
Expand All @@ -740,5 +755,4 @@ prep_acs <- function(x, race, table_code, cv_threshold, pop_threshold) {
df_wide$total_rate <- NA # add dummy total_rate col so RC_Functions work as-is

return(df_wide)
}

}
Loading