# STEP 2: Load 311 noise complaints ----
# Populates `noise_all` with "Loud Music/Party" complaints filed under
# 'Noise - Commercial' or 'Noise - Street/Sidewalk' for 2018-2024.
# If the local RDS cache exists it is read directly; otherwise the data is
# paged from the API one quarter at a time and the cache is written only
# after every request has succeeded.
noise_local_file <- "data/noise_311_2018_2024.rds"
noise_base_url <- "https://data.cityofnewyork.us/resource/erm2-nwe9.json"

if (file.exists(noise_local_file)) {
  message("Loading 311 data from local file...")
  noise_all <- readRDS(noise_local_file)
  message("Loaded ", nrow(noise_all), " records")
} else {
  message("Fetching 311 data from API (2018-2024)... (this can take 2–3 minutes)")

  # Integer page size: offsets derived from it stay integer, so they are never
  # rendered in scientific notation (as.character(1e5) yields "1e+05").
  noise_page_size <- 50000L
  noise_fields <- paste0(
    "created_date,complaint_type,descriptor,location_type,borough,",
    "incident_zip,latitude,longitude,unique_key"
  )

  # Define quarterly periods for API chunking. Each period is a half-open
  # interval [start, end) so consecutive quarters never overlap.
  periods <- unlist(
    lapply(2018:2024, function(yr) {
      list(
        list(start = paste0(yr, "-01-01"), end = paste0(yr, "-04-01"), label = paste0(yr, " Q1")),
        list(start = paste0(yr, "-04-01"), end = paste0(yr, "-07-01"), label = paste0(yr, " Q2")),
        list(start = paste0(yr, "-07-01"), end = paste0(yr, "-10-01"), label = paste0(yr, " Q3")),
        list(start = paste0(yr, "-10-01"), end = paste0(yr + 1, "-01-01"), label = paste0(yr, " Q4"))
      )
    }),
    recursive = FALSE
  )

  # Collect every page in a list and bind once at the end instead of
  # re-copying the accumulated data frame on every iteration.
  pages <- list()

  # Fetch each quarter
  for (period in periods) {
    message(" Fetching: ", period$label)

    # The filter does not change between pages of the same quarter.
    where_clause <- paste0(
      "complaint_type in('Noise - Commercial','Noise - Street/Sidewalk') ",
      "AND descriptor = 'Loud Music/Party' ",
      "AND created_date >= '", period$start, "T00:00:00' ",
      "AND created_date < '", period$end, "T00:00:00'"
    )

    offset <- 0L
    repeat {
      resp <- GET(noise_base_url, query = list(
        "$select" = noise_fields,
        "$where" = where_clause,
        # Explicit ordering keeps $limit/$offset paging stable across requests.
        "$order" = ":id",
        "$limit" = sprintf("%d", noise_page_size),
        "$offset" = sprintf("%d", offset)
      ))

      # Abort instead of silently stopping: a quietly truncated quarter would
      # otherwise be saved to the cache and reused on every later run.
      if (status_code(resp) != 200) {
        stop(
          "311 API request failed for ", period$label,
          " (offset ", offset, "): HTTP ", status_code(resp),
          call. = FALSE
        )
      }

      raw_txt <- content(resp, "text", encoding = "UTF-8")
      batch <- fromJSON(raw_txt, flatten = TRUE)

      # An empty JSON array parses to list(), for which nrow() is NULL, so
      # check the type before looking at the row count.
      if (!is.data.frame(batch) || nrow(batch) == 0L) break

      pages[[length(pages) + 1L]] <- batch

      # A short page means this quarter is exhausted.
      if (nrow(batch) < noise_page_size) break

      offset <- offset + noise_page_size
      Sys.sleep(0.5) # Rate limiting
    }
  }

  noise_all <- as_tibble(bind_rows(pages))

  # Make sure the cache directory exists before writing.
  dir.create(dirname(noise_local_file), recursive = TRUE, showWarnings = FALSE)
  saveRDS(noise_all, noise_local_file)
  message("✓ Saved to: ", noise_local_file)
  message("✓ Total records: ", nrow(noise_all))
}
# Clean noise data ----
# Parses the timestamp, coerces coordinates to numeric, normalises the ZIP to
# its first five characters (left-padded with "0"), derives hour / year /
# month and a day-vs-night label, then drops rows lacking coordinates or ZIP.
noise_clean <- noise_all %>%
  mutate(
    created_date = ymd_hms(created_date),
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude),
    incident_zip = as.character(incident_zip) %>%
      str_sub(1, 5) %>%
      str_pad(5, pad = "0"),
    hour = hour(created_date),
    year = year(created_date),
    month = month(created_date),
    # Daytime is hours 4 through 19; every other hour counts as nighttime.
    time_period = if_else(
      hour >= 4 & hour < 20,
      "Daytime (4AM-8PM)",
      "Nighttime (8PM-4AM)"
    )
  ) %>%
  filter(!(is.na(latitude) | is.na(longitude) | is.na(incident_zip)))

# Summary diagnostics for the cleaned data.
message("✓ 311 records with valid coords + zip: ", nrow(noise_clean))
with(
  noise_clean,
  message("✓ Year range: ", min(year), "–", max(year))
)
message(
  "✓ Time distribution: ",
  with(
    count(noise_clean, time_period),
    paste(time_period, n, sep = " = ", collapse = " | ")
  )
)