Mini Project 02: Making Backyards Affordable for All

Author

Dolma Sherpa

Introduction

Background on housing affordability, motivation, and data sources (ACS, BLS).

Task 1 – Data Acquisition

The following chunk contains all the data import code provided and is folded for organization.

Show code

#Data import
if(!dir.exists(file.path("data", "mp02"))){
  dir.create(file.path("data", "mp02"), showWarnings=FALSE, recursive=TRUE)
}

library <- function(pkg){
  ## Mask base::library() to automatically install packages if needed
  ## Masking is important here so downlit picks up packages and links
  ## to documentation
  pkg <- as.character(substitute(pkg))
  options(repos = c(CRAN = "https://cloud.r-project.org"))
  if(!require(pkg, character.only=TRUE, quietly=TRUE)) install.packages(pkg)
  stopifnot(require(pkg, character.only=TRUE, quietly=TRUE))
}

library(tidyverse)
library(glue)
library(readxl)
library(tidycensus)

get_acs_all_years <- function(variable, geography="cbsa",
                              start_year=2009, end_year=2023){
  fname <- glue("{variable}_{geography}_{start_year}_{end_year}.csv")
  fname <- file.path("data", "mp02", fname)
  
  if(!file.exists(fname)){
    YEARS <- seq(start_year, end_year)
    YEARS <- YEARS[YEARS != 2020] # Drop 2020 - No survey (covid)
    
    ALL_DATA <- map(YEARS, function(yy){
      tidycensus::get_acs(geography, variable, year=yy, survey="acs1") |>
        mutate(year=yy) |>
        select(-moe, -variable) |>
        rename(!!variable := estimate)
    }) |> bind_rows()
    
    write_csv(ALL_DATA, fname)
  }
  
  read_csv(fname, show_col_types=FALSE)
}

# Household income (12 month)
INCOME <- get_acs_all_years("B19013_001") |>
  rename(household_income = B19013_001)

# Monthly rent
RENT <- get_acs_all_years("B25064_001") |>
  rename(monthly_rent = B25064_001)

# Total population
POPULATION <- get_acs_all_years("B01003_001") |>
  rename(population = B01003_001)

# Total number of households
HOUSEHOLDS <- get_acs_all_years("B11001_001") |>
  rename(households = B11001_001)

# Number of new housing units built each year
get_building_permits <- function(start_year = 2009, end_year = 2023){
fname <- glue("housing_units_{start_year}_{end_year}.csv")
fname <- file.path("data", "mp02", fname)

if(!file.exists(fname)){
  HISTORICAL_YEARS <- seq(start_year, 2018)
  
  HISTORICAL_DATA <- map(HISTORICAL_YEARS, function(yy){
    historical_url <- glue("https://www.census.gov/construction/bps/txt/tb3u{yy}.txt")
    
    LINES <- readLines(historical_url)[-c(1:11)]
    
    CBSA_LINES <- str_detect(LINES, "^[[:digit:]]")
    CBSA <- as.integer(str_sub(LINES[CBSA_LINES], 5, 10))
    
    PERMIT_LINES <- str_detect(str_sub(LINES, 48, 53), "[[:digit:]]")
    PERMITS <- as.integer(str_sub(LINES[PERMIT_LINES], 48, 53))
    
    data_frame(CBSA = CBSA,
               new_housing_units_permitted = PERMITS, 
               year = yy)
  }) |> bind_rows()
  
  CURRENT_YEARS <- seq(2019, end_year)
  
  CURRENT_DATA <- map(CURRENT_YEARS, function(yy){
    current_url <- glue("https://www.census.gov/construction/bps/xls/msaannual_{yy}99.xls")
    
    temp <- tempfile()
    
    download.file(current_url, destfile = temp, mode="wb")
    
    fallback <- function(.f1, .f2){
      function(...){
        tryCatch(.f1(...), 
                 error=function(e) .f2(...))
      }
    }
    
    reader <- fallback(read_xlsx, read_xls)
    
    reader(temp, skip=5) |>
      na.omit() |>
      select(CBSA, Total) |>
      mutate(year = yy) |>
      rename(new_housing_units_permitted = Total)
  }) |> bind_rows()
  
  ALL_DATA <- rbind(HISTORICAL_DATA, CURRENT_DATA)
  
  write_csv(ALL_DATA, fname)
  
}

read_csv(fname, show_col_types=FALSE)
}

PERMITS <- get_building_permits()

# Latest NAICS data schema 
library(httr2)
library(rvest)
get_bls_industry_codes <- function(){
  fname <- file.path("data", "mp02", "bls_industry_codes.csv")
  library(dplyr)
  library(tidyr)
  library(readr)
  
  if(!file.exists(fname)){
    
    resp <- request("https://www.bls.gov") |> 
      req_url_path("cew", "classifications", "industry", "industry-titles.htm") |>
      req_headers(`User-Agent` = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:143.0) Gecko/20100101 Firefox/143.0") |> 
      req_error(is_error = \(resp) FALSE) |>
      req_perform()
    
    resp_check_status(resp)
    
    naics_table <- resp_body_html(resp) |>
      html_element("#naics_titles") |> 
      html_table() |>
      mutate(title = str_trim(str_remove(str_remove(`Industry Title`, Code), "NAICS"))) |>
      select(-`Industry Title`) |>
      mutate(depth = if_else(nchar(Code) <= 5, nchar(Code) - 1, NA)) |>
      filter(!is.na(depth))
    
    # These were looked up manually on bls.gov after finding 
    # they were presented as ranges. Since there are only three
    # it was easier to manually handle than to special-case everything else
    naics_missing <- tibble::tribble(
      ~Code, ~title, ~depth, 
      "31", "Manufacturing", 1,
      "32", "Manufacturing", 1,
      "33", "Manufacturing", 1,
      "44", "Retail", 1, 
      "45", "Retail", 1,
      "48", "Transportation and Warehousing", 1, 
      "49", "Transportation and Warehousing", 1
    )
    
    naics_table <- bind_rows(naics_table, naics_missing)
    
    naics_table <- naics_table |> 
      filter(depth == 4) |> 
      rename(level4_title=title) |> 
      mutate(level1_code = str_sub(Code, end=2), 
             level2_code = str_sub(Code, end=3), 
             level3_code = str_sub(Code, end=4)) |>
      left_join(naics_table, join_by(level1_code == Code)) |>
      rename(level1_title=title) |>
      left_join(naics_table, join_by(level2_code == Code)) |>
      rename(level2_title=title) |>
      left_join(naics_table, join_by(level3_code == Code)) |>
      rename(level3_title=title) |>
      select(-starts_with("depth")) |>
      rename(level4_code = Code) |>
      select(level1_title, level2_title, level3_title, level4_title, 
             level1_code,  level2_code,  level3_code,  level4_code) |>
      drop_na() |>
      mutate(across(contains("code"), as.integer))
    
    write_csv(naics_table, fname)
  }
  
  read_csv(fname, show_col_types=FALSE)
}

INDUSTRY_CODES <- get_bls_industry_codes()

# BLS Quarterly Census of Employment and Wages
library(httr2)
library(rvest)
get_bls_qcew_annual_averages <- function(start_year=2009, end_year=2023){
  fname <- glue("bls_qcew_{start_year}_{end_year}.csv.gz")
  fname <- file.path("data", "mp02", fname)
  
  YEARS <- seq(start_year, end_year)
  YEARS <- YEARS[YEARS != 2020] # Drop Covid year to match ACS
  
  if(!file.exists(fname)){
    ALL_DATA <- map(YEARS, .progress=TRUE, possibly(function(yy){
      fname_inner <- file.path("data", "mp02", glue("{yy}_qcew_annual_singlefile.zip"))
      
      if(!file.exists(fname_inner)){
        request("https://www.bls.gov") |> 
          req_url_path("cew", "data", "files", yy, "csv",
                       glue("{yy}_annual_singlefile.zip")) |>
          req_headers(`User-Agent` = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:143.0) Gecko/20100101 Firefox/143.0") |> 
          req_retry(max_tries=5) |>
          req_perform(fname_inner)
      }
      
      if(file.info(fname_inner)$size < 755e5){
        warning(sQuote(fname_inner), "appears corrupted. Please delete and retry this step.")
      }
      
      read_csv(fname_inner, 
               show_col_types=FALSE) |> 
        mutate(YEAR = yy) |>
        select(area_fips, 
               industry_code, 
               annual_avg_emplvl, 
               total_annual_wages, 
               YEAR) |>
        filter(nchar(industry_code) <= 5, 
               str_starts(area_fips, "C")) |>
        filter(str_detect(industry_code, "-", negate=TRUE)) |>
        mutate(FIPS = area_fips, 
               INDUSTRY = as.integer(industry_code), 
               EMPLOYMENT = as.integer(annual_avg_emplvl), 
               TOTAL_WAGES = total_annual_wages) |>
        select(-area_fips, 
               -industry_code, 
               -annual_avg_emplvl, 
               -total_annual_wages) |>
        # 10 is a special value: "all industries" , so omit
        filter(INDUSTRY != 10) |> 
        mutate(AVG_WAGE = TOTAL_WAGES / EMPLOYMENT)
    })) |> bind_rows()
    
    write_csv(ALL_DATA, fname)
  }
  
  ALL_DATA <- read_csv(fname, show_col_types=FALSE)
  
  ALL_DATA_YEARS <- unique(ALL_DATA$YEAR)
  
  YEARS_DIFF <- setdiff(YEARS, ALL_DATA_YEARS)
  
  if(length(YEARS_DIFF) > 0){
    stop("Download failed for the following years: ", YEARS_DIFF, 
         ". Please delete intermediate files and try again.")
  }
  
  ALL_DATA
}

WAGES <- get_bls_qcew_annual_averages()

Pre-Processing

Firstly, examined the structure of six datasets (households, income, rent, population, building permits, and wages) to understand what information each contains.

Then standardized all column names to lowercase and ensured the datasets use consistent location identifiers so they can be combined and compared in later analysis.

Show code

# ---- 1. Explore Loaded Data ----

glimpse(HOUSEHOLDS)

Rows: 7,279
Columns: 4
$ GEOID      <dbl> 10140, 10180, 10300, 10380, 10420, 10500, 10540, 10580, 107…
$ NAME       <chr> "Aberdeen, WA Micro Area", "Abilene, TX Metro Area", "Adria…
$ households <dbl> 27759, 58052, 36835, 91805, 281769, 60101, 43953, 336492, 3…
$ year       <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,…

Show code

glimpse(INCOME)

Rows: 7,279
Columns: 4
$ GEOID            <dbl> 10140, 10180, 10300, 10380, 10420, 10500, 10540, 1058…
$ NAME             <chr> "Aberdeen, WA Micro Area", "Abilene, TX Metro Area", …
$ household_income <dbl> 36345, 42931, 45640, 13470, 47482, 36218, 47669, 5767…
$ year             <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,…

Show code

glimpse(RENT)

Rows: 7,279
Columns: 4
$ GEOID        <dbl> 10140, 10180, 10300, 10380, 10420, 10500, 10540, 10580, 1…
$ NAME         <chr> "Aberdeen, WA Micro Area", "Abilene, TX Metro Area", "Adr…
$ monthly_rent <dbl> 650, 712, 645, 363, 723, 624, 761, 833, 579, 726, 668, 65…
$ year         <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 200…

Show code

glimpse(POPULATION)

Rows: 7,279
Columns: 4
$ GEOID      <dbl> 10140, 10180, 10300, 10380, 10420, 10500, 10540, 10580, 107…
$ NAME       <chr> "Aberdeen, WA Micro Area", "Abilene, TX Metro Area", "Adria…
$ population <dbl> 71797, 160266, 99837, 342495, 699935, 164238, 116584, 85759…
$ year       <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,…

Show code

glimpse(PERMITS)

Rows: 5,658
Columns: 3
$ CBSA                        <dbl> 10180, 10420, 10500, 10580, 10740, 10780, …
$ new_housing_units_permitted <dbl> 214, 741, 213, 1380, 1692, 396, 1648, 125,…
$ year                        <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, …

Show code

glimpse(WAGES)

Rows: 4,442,181
Columns: 6
$ YEAR        <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009…
$ FIPS        <chr> "C1018", "C1018", "C1018", "C1018", "C1018", "C1018", "C10…
$ INDUSTRY    <dbl> 101, 1011, 1012, 1013, 102, 1021, 1022, 1023, 1024, 1025, …
$ EMPLOYMENT  <dbl> 8050, 1769, 3328, 2952, 42334, 11993, 0, 3575, 4697, 11950…
$ TOTAL_WAGES <dbl> 342280951, 86660038, 151822573, 103798340, 1284997543, 368…
$ AVG_WAGE    <dbl> 42519.37, 48988.15, 45619.76, 35162.04, 30353.79, 30764.23…

Show code

# ---- 2. Clean and Standardize Column Names ----

HOUSEHOLDS <- HOUSEHOLDS %>% rename_with(tolower)
INCOME <- INCOME %>% rename_with(tolower)
RENT <- RENT %>% rename_with(tolower)
POPULATION <- POPULATION %>% rename_with(tolower)
PERMITS <- PERMITS %>% rename_with(tolower)
WAGES <- WAGES %>% rename_with(tolower)


# ---- 3. Align join key names ----

PERMITS <- PERMITS %>% rename(geoid = cbsa)

Task 2 – Multi-Table Questions

Please find all answers and code snippets below. Some are 2 parts while oters are single part.

Q1. Which CBSA (by name) permitted the largest number of new housing units in the decade from 2010 to 2019 (inclusive)?

Houston–The Woodlands–Sugar Land, TX permitted the most new housing units during the decade. This highlights Houston’s long-standing permissive zoning and its ability to accommodate strong population inflows.

Show code

permits_decade <- PERMITS %>%
  filter(year >= 2010, year <= 2019) %>%
  group_by(geoid) %>%
  summarise(total_permits_2010_2019 = sum(new_housing_units_permitted, na.rm = TRUE)) %>%
  arrange(desc(total_permits_2010_2019)) %>%
  left_join(select(HOUSEHOLDS, geoid, name), by = "geoid")

head(permits_decade, 5)

Q2. In what year did Albuquerque, NM (CBSA Number 10740) permit the most new housing units?

Albuquerque, NM recorded its highest number of new permits in 2021. The post-pandemic surge suggests short-term recovery rather than sustained, long-run growth.

Show code

albuquerque_peak <- PERMITS %>%
  filter(geoid == 10740, year != 2020) %>%       # exclude 2020 due to COVID data gap
  group_by(year) %>%
  summarise(total_permits = sum(new_housing_units_permitted, na.rm = TRUE)) %>%
  arrange(desc(total_permits))

head(albuquerque_peak, 3)

Q3. Which state (not CBSA) had the highest average individual income in 2015?

Washington, DC ranked highest in per-capita income, closely followed by Massachusetts. Concentrations of government and knowledge-based jobs continue to sustain above-average regional wages.

Show code

# ---- Task 2: Multi-Table Question 3 (part 1) ----
# Computed total income per CBSA and prepare for state-level aggregation

income_state <- INCOME %>%
  filter(year == 2015) %>%                                  # focus on 2015
  left_join(HOUSEHOLDS %>% filter(year == 2015), 
            by = c("geoid", "name", "year")) %>%            # align households
  mutate(total_income_cbsa = household_income * households)  # weighted total

glimpse(income_state)

Rows: 516
Columns: 6
$ geoid             <dbl> 10140, 10180, 10300, 10380, 10420, 10460, 10500, 105…
$ name              <chr> "Aberdeen, WA Micro Area", "Abilene, TX Metro Area",…
$ household_income  <dbl> 44122, 47420, 48279, 14485, 51580, 36142, 40143, 475…
$ year              <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015…
$ households        <dbl> 27430, 59372, 37016, 109586, 282456, 23919, 57597, 4…
$ total_income_cbsa <dbl> 1210266460, 2815420240, 1787095464, 1587353210, 1456…

Show code

# ---- Task 2: Multi-Table Question 3 (part 2) ----
# Extracted principal state abbreviation from CBSA names

income_state <- income_state %>%
  mutate(state = str_extract(name, ", (.{2})")) %>% 
  mutate(state = str_remove(state, ", "))  # clean the comma

state_income_2015 <- income_state %>%
  left_join(POPULATION %>% filter(year == 2015), 
            by = c("geoid", "name", "year")) %>%
  group_by(state) %>%
  summarise(
    total_income_state = sum(total_income_cbsa, na.rm = TRUE),
    total_population_state = sum(population, na.rm = TRUE),
    avg_individual_income = total_income_state / total_population_state
  ) %>%
  arrange(desc(avg_individual_income))

head(state_income_2015, 5)

Q4. What is the last year in which the NYC CBSA had the most data scientists in the country?

New York City last led national employment in NAICS 5182 (data analytics) in 2015, before San Francisco overtook it. This shift illustrates how the nation’s tech-talent hub has migrated westward over the last decade.

Show code

# ---- Task 2: Multi-Table Question 4 (part 1) ----
# Converted BLS FIPS codes (e.g., "C1074") to numeric GEOID format (e.g., 10740)

WAGES <- WAGES %>%
  mutate(
    geoid_bls = fips %>%
      str_remove("C") %>%       # remove the "C" prefix
      as.double() * 10          # multiply by 10 to match Census GEOID format
  )
# ---- Task 2: Multi-Table Question 4 (part 2) ----
# Identify when NYC last had the highest number of data scientists (NAICS 5182)

data_scientists <- WAGES %>%
  filter(industry == 5182) %>%               # focus on data science/analytics
  group_by(geoid_bls, year) %>%
  summarise(total_employment = sum(employment, na.rm = TRUE)) %>%
  arrange(year, desc(total_employment))

# Find which CBSA leads per year
leaders_by_year <- data_scientists %>%
  group_by(year) %>%
  slice_max(total_employment, n = 1)

# Inspect NYC pattern
leaders_by_year %>%
  filter(geoid_bls == 35620) %>%
  arrange(desc(year)) %>%
  head(10)

Q5.What fraction of total wages in the NYC CBSA was earned by people employed in the finance and insurance industries (NAICS code 52)? In what year did this fraction peak?

Finance & Insurance wages peaked around 4.6 percent of total wages in 2014. The sector remains dominant but relatively stable, while employment growth has diversified into other industries.

Show code

finance_share_nyc <- WAGES %>%
  filter(geoid_bls == 35620) %>%
  group_by(year) %>%
  summarise(
    total_wages_all = sum(total_wages, na.rm = TRUE),
    total_wages_finance = sum(total_wages[industry == 52], na.rm = TRUE)
  ) %>%
  mutate(finance_share = total_wages_finance / total_wages_all) %>%
  arrange(desc(finance_share))

head(finance_share_nyc, 5)

Task 3 - Initial Visualizations

Plot 1 – Monthly Rent vs Household Income (2009)

The scatterplot shows a strong positive linear relationship between median household income and median monthly rent across CBSAs. Higher-income regions tend to have proportionally higher rents, reflecting income-linked housing markets. The regression line reinforces a consistent affordability gradient nationwide.

Show code

library(ggplot2)
library(scales)
library(dplyr)

rent_income_2009 <- INCOME %>%
  filter(year == 2009) %>%
  select(geoid, name, household_income) %>%
  inner_join(
    RENT %>% filter(year == 2009) %>% select(geoid, monthly_rent),
    by = "geoid"
  ) %>%
  filter(!is.na(household_income), !is.na(monthly_rent))

p_rent_income_2009 <- ggplot(rent_income_2009,
                             aes(x = household_income, y = monthly_rent)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 0.7) +
  labs(
    title = "Monthly Rent vs. Household Income by CBSA (2009)",
    subtitle = "ACS 1-year estimates; each point is a CBSA",
    x = "Median Household Income (USD)",
    y = "Median Monthly Rent (USD)",
    caption = "Source: U.S. Census Bureau, ACS 1-year (2009)"
  ) +
  scale_x_continuous(labels = label_dollar(accuracy = 1)) +
  scale_y_continuous(labels = label_dollar(accuracy = 1)) +
  theme_bw(base_size = 12)

print(p_rent_income_2009)

Plot 2 – Health-Sector Employment vs Total Employment (2009–2023)

Across all periods, CBSAs with larger overall workforces also support greater employment in health care and social assistance. The share of health-sector jobs rises steadily over time, indicating the sector’s expanding importance within regional labor markets. The facet panels reveal both growth and resilience in health-care employment, even through economic fluctuations.

Show code

library(ggplot2)
library(scales)
library(dplyr)
library(stringr)

# 1) Total employment per CBSA-year (all industries)
total_emp <- WAGES %>%
  group_by(geoid_bls, year) %>%
  summarise(total_emp = sum(employment, na.rm = TRUE), .groups = "drop")

# 2) Health sector (NAICS 62*) employment per CBSA-year
# Including all industries whose NAICS code starts with "62" (62, 621, 622, 623, 624, ...)
health_emp <- WAGES %>%
  mutate(industry_chr = as.character(industry)) %>%
  filter(str_starts(industry_chr, "62")) %>%
  group_by(geoid_bls, year) %>%
  summarise(health_emp = sum(employment, na.rm = TRUE), .groups = "drop")

# 3) Joined and created coarse time periods to make evolution readable
emp_df <- total_emp %>%
  inner_join(health_emp, by = c("geoid_bls", "year")) %>%
  filter(!is.na(total_emp), !is.na(health_emp)) %>%
  mutate(period = case_when(
    year <= 2012 ~ "2009–2012",
    year <= 2016 ~ "2013–2016",
    TRUE         ~ "2017–2023"
  )) %>%
  mutate(period = factor(period, levels = c("2009–2012","2013–2016","2017–2023")))

# 4) Final plot (small multiples to show evolution)
p_emp_health <- ggplot(emp_df, aes(x = total_emp, y = health_emp)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~ period) +
  labs(
    title    = "Health-Sector Employment vs Total Employment by CBSA",
    subtitle = "Evolution over time shown via small multiples (NAICS 62*)",
    x        = "Total Employment (All Industries)",
    y        = "Employment in Health Care & Social Assistance (NAICS 62*)",
    caption  = "Source: BLS QCEW; points are CBSA-years; facets indicate periods"
  ) +
  scale_x_continuous(labels = scales::label_number(scale_cut = scales::cut_short_scale())) +
  scale_y_continuous(labels = scales::label_number(scale_cut = scales::cut_short_scale())) +
  theme_bw(base_size = 12)

print(p_emp_health)

Plot 3 – Average Household Size Over Time (2009–2023)

Most CBSAs maintain stable household sizes near 2.5–3 people, though minor variations persist across metro areas. Los Angeles shows slightly larger average households, while New York’s trend remains flatter and slightly lower. Overall, stability suggests slow demographic shifts despite population growth and housing pressures.

Show code

# Packages
library(ggplot2)
library(dplyr)
if (!requireNamespace("gghighlight", quietly = TRUE)) install.packages("gghighlight")
library(gghighlight)

# Compute average household size: population / households
household_size <- POPULATION %>%
  select(geoid, name, year, population) %>%
  inner_join(HOUSEHOLDS %>% select(geoid, year, households),
             by = c("geoid", "year")) %>%
  mutate(avg_household_size = population / households) %>%
  filter(year >= 2009, year != 2020)                       # omit COVID gap

# Target CBSAs to highlight
highlight_targets <- c(
  "New York-Newark-Jersey City, NY-NJ-PA Metro Area",
  "Los Angeles-Long Beach-Anaheim, CA Metro Area"
)

# Spaghetti plot with highlights and direct labels
p_household_size <- ggplot(household_size,
                           aes(x = year, y = avg_household_size,
                               group = name, color = name)) +
  geom_line(alpha = 0.35, linewidth = 0.6) +
  gghighlight(name %in% highlight_targets,
              label_key = name,
              use_direct_label = TRUE,
              label_params = list(size = 3, fontface = "bold")) +
  labs(
    title    = "Average Household Size Over Time by CBSA (2009–2023)",
    x        = "Year",
    y        = "Average Household Size",
    caption  = "Source: U.S. Census Bureau, ACS 1-year; 2020 omitted"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

print(p_household_size)

Task 4: Rent Burden

Key Findings

To evaluate housing affordability, I developed a population-weighted Rent Burden Index that scales the rent-to-income ratio so that 100 represents the national weighted average. CBSAs with values below 100 spend proportionally less of their income on rent, while those above 100 face greater housing cost pressure.

In 2023, the least-burdened CBSAs, such as Albertville, AL Micro Area and Mount Airy, NC Micro Area, registered index values near 70, with rent-to-income ratios around 0.14 — meaning typical residents spent only 14 % of income on rent, well below the national benchmark. By contrast, the most-burdened CBSAs exceeded 130 on the index, underscoring how larger metros continue to struggle with sustained rent pressure even as smaller markets remain relatively affordable.

Show code

# ---- Task 4: Rent Burden Metric ----

# 1) Joined RENT, INCOME, POPULATION 
rent_burden_weighted <- RENT %>%
  inner_join(INCOME,    by = c("geoid", "name", "year")) %>%
  inner_join(POPULATION, by = c("geoid", "name", "year")) %>%
  filter(!is.na(monthly_rent), !is.na(household_income), household_income > 0) %>%
  mutate(
    rent_to_income = (monthly_rent * 12) / household_income
  )

# 2) Population-weighted baseline (100 = national average)
baseline_rent_ratio_pop <- with(
  rent_burden_weighted,
  weighted.mean(rent_to_income, w = population, na.rm = TRUE)
)

# 3) Standardized index (weighted)
rent_burden_weighted <- rent_burden_weighted %>%
  mutate(
    rent_burden_index_weighted = (rent_to_income / baseline_rent_ratio_pop) * 100
  )

# 4) Compared to unweighted baseline if available
if (exists("baseline_rent_ratio")) {
  compare_baselines <- data.frame(
    baseline_unweighted = baseline_rent_ratio,
    baseline_weighted   = baseline_rent_ratio_pop
  )
  print(compare_baselines)
} else {
  message("Unweighted baseline not found in this session; skipping baseline comparison.")
}

# 5) Latest year weighted rankings
latest_year <- max(rent_burden_weighted$year, na.rm = TRUE)

rent_burden_latest_weighted <- rent_burden_weighted %>%
  filter(year == latest_year) %>%
  distinct(name, .keep_all = TRUE) %>%  # ensure one row per CBSA
  select(name, rent_burden_index_weighted) %>%
  arrange(desc(rent_burden_index_weighted))

top10_burdened_weighted    <- head(rent_burden_latest_weighted, 10)
bottom10_burdened_weighted <- tail(rent_burden_latest_weighted, 10)

cat("\nTop 10 Most Burdened CBSAs (Weighted):\n")


Top 10 Most Burdened CBSAs (Weighted):

Show code

print(top10_burdened_weighted)

# A tibble: 10 × 2
   name                                                 rent_burden_index_weig…¹
   <chr>                                                                   <dbl>
 1 Clearlake, CA Micro Area                                                 151.
 2 Aguadilla, PR Metro Area                                                 150.
 3 Cape Coral-Fort Myers, FL Metro Area                                     146.
 4 Miami-Fort Lauderdale-West Palm Beach, FL Metro Area                     146.
 5 Port St. Lucie, FL Metro Area                                            143.
 6 Ponce, PR Metro Area                                                     139.
 7 Tampa-St. Petersburg-Clearwater, FL Metro Area                           138.
 8 Key West-Key Largo, FL Micro Area                                        137.
 9 North Port-Bradenton-Sarasota, FL Metro Area                             137.
10 Ocala, FL Metro Area                                                     136.
# ℹ abbreviated name: ¹rent_burden_index_weighted

Show code

cat("\nBottom 10 Least Burdened CBSAs (Weighted):\n")


Bottom 10 Least Burdened CBSAs (Weighted):

Show code

print(bottom10_burdened_weighted)

# A tibble: 10 × 2
   name                                       rent_burden_index_weighted
   <chr>                                                           <dbl>
 1 Albertville, AL Micro Area                                       69.9
 2 Mount Airy, NC Micro Area                                        69.6
 3 Talladega-Sylacauga, AL Micro Area                               68.9
 4 Decatur, AL Metro Area                                           68.1
 5 Jefferson City, MO Metro Area                                    67.1
 6 Wisconsin Rapids-Marshfield, WI Micro Area                       67.1
 7 Watertown-Fort Atkinson, WI Micro Area                           66.9
 8 Bismarck, ND Metro Area                                          66.3
 9 Manitowoc, WI Micro Area                                         64.1
10 Laconia, NH Micro Area                                           61.8

Show code

# 6) Quick peek at a few high-burden CBSAs (weighted)
compare_sample <- rent_burden_weighted %>%
  filter(year == latest_year) %>%
  select(name, rent_burden_index_weighted) %>%
  arrange(desc(rent_burden_index_weighted)) %>%
  head(10)

Task 4 - Metro pick: Pittsburgh

I chose Pittsburgh, PA Metro Area because it was the first place I called home after moving to the U.S., and it still feels like the one city I could always see myself returning to. I’ve thought about moving back a few times — though, truthfully, the food in Jackson Heights might be the only thing keeping me rooted in NYC right now.

Looking at the numbers, Pittsburgh’s story validates that instinct.

From 2009 to 2018, its population-weighted Rent Burden Index consistently stayed below the national benchmark (≈80–82 vs. 100), meaning the typical resident spent roughly 16–17% of income on rent — well under the national average.

While rents have risen modestly, income growth has largely kept pace, preserving affordability and making Pittsburgh a quiet YIMBY success story in contrast to the escalating rent pressures of larger coastal metros

Show code

# ---- Task 4 Tables: DT outputs for trend + top/bottom ----
suppressPackageStartupMessages({ library(dplyr); library(DT) })

# Pick ONE metro to showcase the time trend (change this string as needed)
METRO_PICK <- "Pittsburgh, PA Metro Area"

# A) Trend table for selected metro
rent_burden_trend_dt <- rent_burden_weighted %>%
  filter(name == METRO_PICK) %>%
  arrange(year) %>%
  transmute(
    year,
    monthly_rent              = round(monthly_rent, 0),
    household_income          = round(household_income, 0),
    rent_to_income            = round(rent_to_income, 3),
    rent_burden_index_weighted= round(rent_burden_index_weighted, 1)
  )

# B) Latest-year top & bottom CBSAs by weighted index
latest_year_rb <- max(rent_burden_weighted$year, na.rm = TRUE)

rent_burden_latest_w <- rent_burden_weighted %>%
  filter(year == latest_year_rb) %>%
  distinct(name, .keep_all = TRUE) %>%
  transmute(
    name,
    rent_burden_index_weighted = round(rent_burden_index_weighted, 1),
    rent_to_income = round(rent_to_income, 3)
  ) %>%
  arrange(desc(rent_burden_index_weighted))

top10_rent_burden_w    <- head(rent_burden_latest_w, 10)
bottom10_rent_burden_w <- tail(rent_burden_latest_w, 10)

# ---- DT render (interactive; shows in Viewer/HTML) ----
if (interactive()) {
  DT::datatable(
    rent_burden_trend_dt,
    caption = paste("Rent Burden Trend —", METRO_PICK),
    options = list(pageLength = 12, dom = "tip"),
    rownames = FALSE
  )
  
  DT::datatable(
    top10_rent_burden_w,
    caption = paste0("Top 10 Most Burdened CBSAs (Weighted, ", latest_year_rb, ")"),
    options = list(pageLength = 10, dom = "tip"),
    rownames = FALSE
  )
  
  DT::datatable(
    bottom10_rent_burden_w,
    caption = paste0("Bottom 10 Least Burdened CBSAs (Weighted, ", latest_year_rb, ")"),
    options = list(pageLength = 10, dom = "tip"),
    rownames = FALSE
  )
}

# Console fallbacks (so you still see something when knitting to PDF)
cat("\nTrend (first 10 rows) —", METRO_PICK, ":\n"); print(head(rent_burden_trend_dt, 10))


Trend (first 10 rows) — Pittsburgh, PA Metro Area :

# A tibble: 10 × 5
    year monthly_rent household_income rent_to_income rent_burden_index_weighted
   <dbl>        <dbl>            <dbl>          <dbl>                      <dbl>
 1  2009          643            46349          0.166                       80.7
 2  2010          656            46700          0.169                       81.7
 3  2011          682            48854          0.168                       81.2
 4  2012          700            50489          0.166                       80.7
 5  2013          712            51291          0.167                       80.8
 6  2014          743            52293          0.171                       82.7
 7  2015          756            54080          0.168                       81.3
 8  2016          766            56063          0.164                       79.5
 9  2017          794            58521          0.163                       78.9
10  2018          832            59710          0.167                       81.1

Task 5: Housing Growth Index

To capture how actively regions expand their housing supply, I constructed two complementary metrics:

An Instantaneous Housing Growth Index measuring the number of housing units permitted per 1,000 residents, and
A Rate-Based Index comparing new permits to population growth over a rolling five-year window. Both are standardized so that 100 = the national average, making the comparison intuitive across CBSAs.

In 2023, Salisbury, MD and Myrtle Beach, SC topped the instantaneous index with values ≈ 900–1,000—nearly 10 times the national rate—reflecting extraordinary post-pandemic expansion along the Southeast corridor. Conversely, Wheeling, WV–OH and Danville, IL recorded single-digit scores, highlighting stagnation in smaller Rust-Belt metros. When combining the two measures into a Composite Housing Growth Index, the highest performers clustered in Florida and Texas (e.g., Cape Coral–Fort Myers and Austin–Round Rock), while the lowest were concentrated in older industrial areas of the Midwest and Appalachia.

Together, these findings illustrate the uneven geography of U.S. homebuilding: rapid suburban expansion in the Sun Belt versus minimal permit activity in legacy metros—an imbalance that continues to shape affordability and migration patterns.

Show code

# ---- Task 5: Base join + 5-year population lookback ----
library(dplyr)

# Joined POPULATION and PERMITS, per CBSA-year
housing_base <- POPULATION %>%
  select(geoid, name, year, population) %>%
  inner_join(
    PERMITS %>% select(geoid, year, new_housing_units_permitted),
    by = c("geoid", "year")
  ) %>%
  arrange(geoid, year) %>%
  group_by(geoid) %>%
  # 5 observed-years lookback (note: 2020 is absent in the data, so lag(5) still spans ~5 years)
  mutate(
    pop_lag5       = dplyr::lag(population, 5),
    pop_growth_5y  = population - pop_lag5,                  # absolute growth over ~5 years
    pop_growth_pct = ifelse(!is.na(pop_lag5) & pop_lag5 > 0,
                            (population / pop_lag5) - 1, NA)  # percentage growth over ~5 years
  ) %>%
  ungroup()

# quick check
housing_base %>% 
  select(geoid, name, year, population, new_housing_units_permitted, pop_lag5, pop_growth_5y, pop_growth_pct) %>%
  head(12)

Show code

# ---- Task 5a: Instantaneous Housing Growth Index ----
housing_instantaneous <- housing_base %>%
  mutate(permits_per_1000 = (new_housing_units_permitted / population) * 1000)

baseline_inst <- mean(housing_instantaneous$permits_per_1000, na.rm = TRUE)

housing_instantaneous <- housing_instantaneous %>%
  mutate(housing_growth_index_instantaneous = (permits_per_1000 / baseline_inst) * 100)

# preview top/bottom CBSAs (latest year)
latest_year_inst <- max(housing_instantaneous$year, na.rm = TRUE)

top10_growth <- housing_instantaneous %>%
  filter(year == latest_year_inst) %>%
  arrange(desc(housing_growth_index_instantaneous)) %>%
  select(name, housing_growth_index_instantaneous) %>%
  head(10)

bottom10_growth <- housing_instantaneous %>%
  filter(year == latest_year_inst) %>%
  arrange(housing_growth_index_instantaneous) %>%
  select(name, housing_growth_index_instantaneous) %>%
  head(10)

cat("Top 10 CBSAs (Instantaneous Housing Growth):\n")

Top 10 CBSAs (Instantaneous Housing Growth):

Show code

print(top10_growth)

# A tibble: 10 × 2
   name                                                  housing_growth_index_…¹
   <chr>                                                                   <dbl>
 1 Salisbury, MD Metro Area                                                1022.
 2 Myrtle Beach-Conway-North Myrtle Beach, SC Metro Area                    898.
 3 Punta Gorda, FL Metro Area                                               582.
 4 Crestview-Fort Walton Beach-Destin, FL Metro Area                        497.
 5 North Port-Bradenton-Sarasota, FL Metro Area                             474.
 6 Daphne-Fairhope-Foley, AL Metro Area                                     466.
 7 Sherman-Denison, TX Metro Area                                           451.
 8 Cape Coral-Fort Myers, FL Metro Area                                     440.
 9 Austin-Round Rock-San Marcos, TX Metro Area                              424.
10 Lakeland-Winter Haven, FL Metro Area                                     414.
# ℹ abbreviated name: ¹housing_growth_index_instantaneous

Show code

cat("\nBottom 10 CBSAs:\n")


Bottom 10 CBSAs:

Show code

print(bottom10_growth)

# A tibble: 10 × 2
   name                                   housing_growth_index_instantaneous
   <chr>                                                               <dbl>
 1 Wheeling, WV-OH Metro Area                                           2.20
 2 Danville, IL Micro Area                                              3.02
 3 Morgantown, WV Metro Area                                            3.25
 4 Weirton-Steubenville, WV-OH Metro Area                               3.80
 5 Enid, OK Metro Area                                                  4.37
 6 Decatur, IL Metro Area                                               9.15
 7 Johnstown, PA Metro Area                                             9.74
 8 Bay City, MI Metro Area                                             10.0 
 9 Williamsport, PA Metro Area                                         13.7 
10 Fairbanks-College, AK Metro Area                                    14.0

Show code

# ---- Task 5b: Rate-Based Housing Growth (national baseline = 100) ----
housing_rate <- housing_base %>%
  filter(!is.na(pop_growth_5y), pop_growth_5y > 0) %>%
  mutate(
    permits_per_growth = new_housing_units_permitted / pop_growth_5y
  )

baseline_rate <- mean(housing_rate$permits_per_growth, na.rm = TRUE)

housing_rate <- housing_rate %>%
  mutate(
    housing_growth_index_rate = (permits_per_growth / baseline_rate) * 100
  )

# Rank CBSAs by latest available year
latest_year_rate <- max(housing_rate$year, na.rm = TRUE)

top10_rate <- housing_rate %>%
  filter(year == latest_year_rate) %>%
  arrange(desc(housing_growth_index_rate)) %>%
  select(name, housing_growth_index_rate) %>%
  head(10)

bottom10_rate <- housing_rate %>%
  filter(year == latest_year_rate) %>%
  arrange(housing_growth_index_rate) %>%
  select(name, housing_growth_index_rate) %>%
  head(10)

cat("Top 10 CBSAs (Rate-Based Housing Growth):\n")

Top 10 CBSAs (Rate-Based Housing Growth):

Show code

print(top10_rate)

# A tibble: 10 × 2
   name                                                 housing_growth_index_r…¹
   <chr>                                                                   <dbl>
 1 Springfield, OH Metro Area                                              1549.
 2 Urban Honolulu, HI Metro Area                                            932.
 3 Dalton, GA Metro Area                                                    672.
 4 Brunswick-St. Simons, GA Metro Area                                      489.
 5 Anchorage, AK Metro Area                                                 410.
 6 Racine-Mount Pleasant, WI Metro Area                                     369.
 7 Bridgeport-Stamford-Danbury, CT Metro Area                               356.
 8 Miami-Fort Lauderdale-West Palm Beach, FL Metro Area                     334.
 9 Hickory-Lenoir-Morganton, NC Metro Area                                  260.
10 Brownsville-Harlingen, TX Metro Area                                     250.
# ℹ abbreviated name: ¹housing_growth_index_rate

Show code

cat("\nBottom 10 CBSAs:\n")


Bottom 10 CBSAs:

Show code

print(bottom10_rate)

# A tibble: 10 × 2
   name                                   housing_growth_index_rate
   <chr>                                                      <dbl>
 1 Atlantic City-Hammonton, NJ Metro Area                      1.46
 2 Longview, TX Metro Area                                     1.83
 3 Morgantown, WV Metro Area                                   2.09
 4 Manhattan, KS Metro Area                                    2.82
 5 Jackson, TN Metro Area                                      3.15
 6 Ames, IA Metro Area                                         3.62
 7 Monroe, LA Metro Area                                       3.63
 8 La Crosse-Onalaska, WI-MN Metro Area                        5.41
 9 Fresno, CA Metro Area                                       6.31
10 Billings, MT Metro Area                                     6.89

Show code

# ---- Task 5c: Composite Housing Growth Index ----
housing_composite <- housing_base %>%
  left_join(
    housing_instantaneous %>% select(geoid, year, housing_growth_index_instantaneous),
    by = c("geoid", "year")
  ) %>%
  left_join(
    housing_rate %>% select(geoid, year, housing_growth_index_rate),
    by = c("geoid", "year")
  ) %>%
  mutate(
    composite_housing_growth_index =
      0.5 * housing_growth_index_instantaneous +
      0.5 * housing_growth_index_rate
  )

# ---- Task 5d: Rolling 5-Year Average (Smooth Composite) ----
# Homebuilding is slow—so smooth yearly volatility for a clearer long-term signal.

library(RcppRoll)

housing_composite <- housing_composite %>%
  group_by(geoid) %>%
  arrange(year, .by_group = TRUE) %>%
  mutate(
    composite_5yr_avg = roll_meanr(composite_housing_growth_index, n = 5, fill = NA)
  )

# ---- Task 5e: Identify High / Low CBSAs (Latest Year Available) ----

latest_year_comp <- max(housing_composite$year, na.rm = TRUE)

top10_composite <- housing_composite %>%
  filter(year == latest_year_comp) %>%
  arrange(desc(composite_5yr_avg)) %>%
  select(name, composite_5yr_avg) %>%
  head(10)

bottom10_composite <- housing_composite %>%
  filter(year == latest_year_comp) %>%
  arrange(composite_5yr_avg) %>%
  select(name, composite_5yr_avg) %>%
  head(10)

cat("Top 10 CBSAs (Composite Housing Growth, 5-Year Rolling):\n")

Top 10 CBSAs (Composite Housing Growth, 5-Year Rolling):

Show code

print(top10_composite)

# A tibble: 10 × 3
# Groups:   geoid [10]
   geoid name                                          composite_5yr_avg
   <dbl> <chr>                                                     <dbl>
 1 48140 Wausau, WI Metro Area                                     1036.
 2 10580 Albany-Schenectady-Troy, NY Metro Area                     398.
 3 31140 Louisville/Jefferson County, KY-IN Metro Area              391.
 4 33780 Monroe, MI Metro Area                                      350.
 5 39460 Punta Gorda, FL Metro Area                                 297.
 6 41100 St. George, UT Metro Area                                  250.
 7 12420 Austin-Round Rock-San Marcos, TX Metro Area                247.
 8 15180 Brownsville-Harlingen, TX Metro Area                       242.
 9 35840 North Port-Bradenton-Sarasota, FL Metro Area               239.
10 15980 Cape Coral-Fort Myers, FL Metro Area                       226.

Show code

cat("\nBottom 10 CBSAs:\n")


Bottom 10 CBSAs:

Show code

print(bottom10_composite)

# A tibble: 10 × 3
# Groups:   geoid [10]
   geoid name                                         composite_5yr_avg
   <dbl> <chr>                                                    <dbl>
 1 34060 Morgantown, WV Metro Area                                 10.6
 2 25620 Hattiesburg, MS Metro Area                                21.9
 3 39740 Reading, PA Metro Area                                    25.5
 4 30980 Longview, TX Metro Area                                   26.0
 5 39300 Providence-Warwick, RI-MA Metro Area                      30.5
 6 26300 Hot Springs, AR Metro Area                                31.8
 7 45460 Terre Haute, IN Metro Area                                33.6
 8 33700 Modesto, CA Metro Area                                    34.6
 9 10900 Allentown-Bethlehem-Easton, PA-NJ Metro Area              42.8
10 32900 Merced, CA Metro Area                                     45.2

Task 6: Relationship Between Rent Burden and Housing Growth

Plot A – Housing Growth vs. Change in Rent Burden

This scatterplot compares five-year housing growth against the change in rent burden from 2009–2011 to 2023.

CBSAs in the upper-left quadrant—high growth with falling rent burden—represent clear YIMBY successes such as Austin and Sarasota, where permitting kept pace with demand and improved affordability.

By contrast, the lower-right quadrant signals NIMBY pressure, where weak construction coincides with rising rent costs, notably in legacy metros like New York and Los Angeles.

Overall, areas with sustained population growth and strong permitting activity tend to experience slower rent escalation, suggesting supply responsiveness helps moderate housing costs.

Show code

#Visualizing Rent Burden vs Housing Growth
library(dplyr)
library(ggplot2)
library(stringr)
library(scales)

early_years <- 2009:2011
latest_year_all <- min(
  max(rent_burden_weighted$year, na.rm = TRUE),
  max(housing_composite$year, na.rm = TRUE)
)

rb_early <- rent_burden_weighted %>%
  filter(year %in% early_years) %>%
  group_by(geoid, name) %>%
  summarise(rent_burden_early = mean(rent_burden_index_weighted, na.rm = TRUE), .groups = "drop")

rb_latest <- rent_burden_weighted %>%
  filter(year == latest_year_all) %>%
  select(geoid, name, rent_burden_latest = rent_burden_index_weighted)

pop_growth_total <- POPULATION %>%
  group_by(geoid, name) %>%
  summarise(
    pop_2009 = population[year == 2009][1],
    pop_latest = population[year == latest_year_all][1],
    .groups = "drop"
  ) %>%
  mutate(
    pop_growth_abs = pop_latest - pop_2009,
    pop_growth_pct = ifelse(!is.na(pop_2009) & pop_2009 > 0,
                            (pop_latest / pop_2009) - 1, NA_real_)
  )

growth_latest <- housing_composite %>%
  filter(year == latest_year_all) %>%
  select(geoid, name, composite_5yr_avg)

task6_summary <- rb_early %>%
  inner_join(rb_latest, by = c("geoid","name")) %>%
  inner_join(pop_growth_total, by = c("geoid","name")) %>%
  inner_join(growth_latest, by = c("geoid","name")) %>%
  mutate(
    rent_burden_change = rent_burden_latest - rent_burden_early,
    grew_population = pop_growth_abs > 0
  )

# Plot A: Housing Growth vs Change in Rent Burden (polished)
plot_a <- ggplot(task6_summary,
                 aes(x = rent_burden_change, y = composite_5yr_avg,
                     shape = grew_population)) +
  # Baselines
  geom_hline(yintercept = 100, linewidth = 0.3) +
  geom_vline(xintercept = 0,   linewidth = 0.3) +
  # Points
  geom_point(alpha = 0.7) +
  scale_shape_discrete(name = "Population grew") +
  # Labels
  labs(
    title = "Housing Growth vs. Change in Rent Burden",
    subtitle = paste0("Early baseline: ", min(early_years), "–", max(early_years),
                      "; Latest year: ", latest_year_all),
    x = "Change in Rent Burden Index (Latest − Early)  [< 0 = improved affordability]",
    y = "Composite Housing Growth Index (5-year rolling)  [100 = national avg]"
  ) +
  # Focus the y-range so most CBSAs are readable; adjust if needed
  coord_cartesian(ylim = c(0, 400)) +
  scale_y_continuous(labels = function(x) scales::number(x, accuracy = 1)) +
  # Quadrant annotations
  annotate("text", x = -Inf, y = Inf, hjust = -0.05, vjust = 1.2,
           label = "↑ Growth, ↓ Burden (YIMBY success)", size = 3.5) +
  annotate("text", x =  Inf, y =  0,  hjust = 1.05, vjust = -0.6,
           label = "↓ Growth, ↑ Burden (NIMBY pressure)", size = 3.5) +
  theme_bw(base_size = 12)

plot_a

Plot A – Housing Growth vs. Change in Rent Burden

This scatterplot compares five-year housing growth against the change in rent burden from 2009–2011 to 2023.

By contrast, the lower-right quadrant signals NIMBY pressure, where weak construction coincides with rising rent costs, notably in legacy metros like New York and Los Angeles.

Overall, areas with sustained population growth and strong permitting activity tend to experience slower rent escalation, suggesting supply responsiveness helps moderate housing costs.

Show code

# Plot B: Rent Burden Index Over Time — Selected CBSAs (polished)
focus_cbsa <- c(
  "New York-Newark-Jersey City, NY-NJ-PA Metro Area",
  "Los Angeles-Long Beach-Anaheim, CA Metro Area",
  "Miami-Fort Lauderdale-West Palm Beach, FL Metro Area",
  "Pittsburgh, PA Metro Area",
  "Austin-Round Rock-San Marcos, TX Metro Area"
)

rb_traj <- rent_burden_weighted %>%
  filter(name %in% focus_cbsa) %>%
  arrange(year)

plot_b <- ggplot(rb_traj, aes(x = year, y = rent_burden_index_weighted, color = name)) +
  geom_hline(yintercept = 100, linewidth = 0.3) +   # national average reference
  geom_line(linewidth = 1) +
  labs(
    title = "Rent Burden Index Over Time — Selected CBSAs",
    subtitle = "Index scaled so 100 = population-weighted national average",
    x = "Year", y = "Rent Burden Index (weighted)"
  ) +
  guides(color = guide_legend(title = "CBSA")) +
  theme_minimal(base_size = 13)

plot_b

Task 7 — Policy Brief: Making Backyards Affordable for All

Executive Summary

The Making Backyards Affordable Act encourages local governments to expand housing supply through measurable YIMBY performance incentives. By tying federal grants to proven indicators of affordability and housing production, this legislation aligns national resources with local success.

Across the U.S., rent burdens have risen faster than wages, particularly in metros that restrict development. Our analysis uses two standardized indices—Rent Burden Index (RBI) and Housing Growth Index (HGI)—to identify where affordability has improved and where it has stagnated.

Proposed Sponsors

Role	City (Representative Base)	Why Selected
Primary Sponsor (YIMBY Success)	Austin–Round Rock–San Marcos, TX Metro Area	Rapid population growth and sustained permitting activity (HGI > 240), yet rent burden near or below the national average (≈ 100). Represents measurable YIMBY success.
Co-Sponsor (Needs Federal Incentives)	New York–Newark–Jersey City, NY-NJ-PA Metro Area	Persistent rent pressure (RBI ≈ 110 – 115) and limited relative housing expansion (HGI ≈ 100). A prime candidate for capacity-building incentives.

Coalition Strategy: Labor & Industry Partners

Occupation / Union	Presence & Stake	How the Bill Benefits Them
Construction & Skilled Trades	High employment in both metros; cyclical exposure to housing cycles.	Sustained federal funding for local permitting ensures stable union jobs and predictable project pipelines.
Service & Hospitality Workers	Large renter population; concentrated in urban cores.	Lower rent burden frees disposable income → increased local spending, greater shift stability, and tip reliability.

These sectors provide strong, bipartisan anchors for local support: they link affordability relief (workers as renters) and growth (workers as builders).

Metrics for Accountability (Non-Technical Summary)

Rent Burden Index (RBI)
Proportion of median income spent on rent, standardized so 100 = national average.
Higher = less affordable; falling RBI = improvement.
Housing Growth Index (HGI)
Number of new housing units permitted per 1,000 residents, standardized so 100 = national average.
Includes a 5-year rolling composite to smooth volatility and highlight sustained growth.

These transparent indices allow Congress to reward measurable progress and prevent one-year anomalies from driving funding decisions.

Legislative Pitch

Cities that grow housing faster than population without worsening affordability deserve continued investment.
Under the proposed Act, metros like Austin would qualify for “High-Performance YIMBY” grants, while metros like New York City could access Reform Incentive Funds to modernize zoning and streamline permitting.

By linking evidence-based metrics to clear funding tiers (e.g., HGI > 125 and ΔRBI < 0), the bill rewards outcomes, not ideology.

Closing Appeal

Affordable housing and strong labor markets are mutually reinforcing.
This program lets Congress champion both—helping growing metros build responsibly and high-cost metros regain affordability—while giving voters a simple, data-driven way to see progress in their communities.