# Download & prepare Census geographies ----------------------------------
options(tigris_use_cache = TRUE) # cache downloads to save time on re-runs

# All US ZCTAs re-projected to PA South State Plane (EPSG:2272, ft).
# zctas() cannot filter by state, so pull everything ONCE and subset
# locally (the original code downloaded/transformed this twice).
zcta_all <- zctas(year = 2021) %>%
  rename(zip_code = GEOID20) %>%
  st_transform("EPSG:2272") # re-project

# Philadelphia ZCTAs: ZIP prefix "191", water bodies erased
zcta <- zcta_all %>%
  filter(substr(ZCTA5CE20, start = 1, stop = 3) == "191") %>%
  erase_water(area_threshold = 0.5)

# Shared crop extent: Philly ZCTAs buffered by 5000 ft (hoisted out of
# the three pipelines that previously each recomputed it)
crop_box <- st_bbox(zcta %>% st_buffer(5000))

# Background ZCTAs: everything within the buffered Philly extent
zcta_bg <- zcta_all %>%
  st_crop(crop_box) %>%
  erase_water(area_threshold = 0.5)

zip_list <- zcta$ZCTA5CE20 # ZIP codes to include in the API query

phl_bound <- st_union(zcta) %>% # dissolved Philadelphia boundary
  st_as_sf()

# Primary/secondary roads for PA and NJ, cropped to the same extent
roads_pa <- primary_secondary_roads(state = "PA", year = 2021) %>%
  st_transform("EPSG:2272") %>%
  st_crop(crop_box) %>%
  erase_water()
roads_nj <- primary_secondary_roads(state = "NJ", year = 2021) %>%
  st_transform("EPSG:2272") %>%
  st_crop(crop_box) %>%
  erase_water()
roads <- rbind(roads_nj, roads_pa)
# Setup ----
# Libraries ----
# Tigris boundaries ----
# Yelp Fusion API ----
# get_yelp() function ----
#### Yelp api call function ####
url <- "https://api.yelp.com/v3/businesses/search"

# Query the Yelp Fusion business-search endpoint for one ZIP code.
#
# Args:
#   category:   business category / search term (e.g. "restaurants")
#   zip_code:   location to search around
#   offset_num: number of records to skip (pagination; 50 results/page)
#
# Returns a data.frame with columns name, rating, address, lat, lon,
# alias, title — one row per business, zero rows when Yelp returns none.
get_yelp <- function(category, zip_code, offset_num) {
  queryString <- list(
    location = zip_code, # argument to be filled
    term = category,     # argument to be filled
    sort_by = "distance",
    limit = 50,          # 50 is the hard maximum for the Yelp Fusion API
    offset = offset_num  # argument to be filled
  )
  # SECURITY NOTE(review): an API key was previously hardcoded here and is
  # therefore compromised — revoke it. Supply a fresh key via the
  # YELP_API_KEY environment variable instead.
  api_key <- Sys.getenv("YELP_API_KEY")
  if (!nzchar(api_key)) {
    stop("YELP_API_KEY environment variable is not set", call. = FALSE)
  }
  # use "GET" verb to request information from url
  response <- VERB("GET", url, query = queryString,
                   add_headers('Authorization' = paste("Bearer", api_key)),
                   content_type("application/octet-stream"),
                   accept("application/json"))
  # parse the JSON response (simplify/flatten -> businesses data frame)
  yelp.json <- content(response, "parsed", flatten = TRUE, simplify = TRUE)
  biz <- yelp.json$businesses
  # Empty or missing result set: return a ZERO-ROW frame with the same
  # columns so callers can bind results without special-casing. (The old
  # code crashed here: `colnames<-` ran on a 0-column frame, and its
  # fallback data.frame mixed length-1 and length-0 columns.)
  if (is.null(biz) || length(biz) == 0 || is.null(biz$name)) {
    return(data.frame(name = character(0), rating = numeric(0),
                      address = character(0), lat = numeric(0),
                      lon = numeric(0), alias = character(0),
                      title = character(0), stringsAsFactors = FALSE))
  }
  # assemble the core business attributes
  yelp.df <- data.frame(
    name = biz$name,
    rating = biz$rating,
    address = biz$location.address1,
    lat = biz$coordinates.latitude,
    lon = biz$coordinates.longitude,
    stringsAsFactors = FALSE
  )
  # collapse each business's category aliases/titles into a single row
  # (this gives us cuisine information)
  cuis.df <- map_dfr(biz$categories, function(x) {
    tibble(
      alias = paste(x$alias, collapse = ", "),
      title = paste(x$title, collapse = ", "))
  })
  cbind(yelp.df, as.data.frame(cuis.df))
}
# Build a named list — one entry per ZIP code — of empty result
# dataframes. Each element is a zero-row frame of business attributes,
# ready to be filled in by the API-collection loop.
initialize_named_dfs <- function(zips) {
  template <- data.frame(name = character(0), rating = numeric(0),
                         address = character(0), lat = numeric(0),
                         lon = numeric(0))
  setNames(lapply(zips, function(z) template), zips)
}
# Number of result pages to request per ZIP. Yelp's query limit is 50
# results per call, so 11 pages x 50 = up to a 550-business sample per
# ZIP code. (Page i+1 holds records i*50 + 1 .. (i+1)*50.)
n_pages <- 11
# Master list of per-page results: offset_list[[i]] is a named list of
# one empty dataframe per ZIP for page i. Replaces the previous eleven
# hand-written biz_list_0 .. biz_list_10 variables.
offset_list <- lapply(seq_len(n_pages), function(i) {
  initialize_named_dfs(zip_list)
})
# Collect Yelp results: one outer pass per page of results, one API call
# per ZIP code inside it.
# BUG FIXES vs. the original loop:
#  * it iterated 1:length(zip_list) while indexing offset_list[[i]],
#    which overruns offset_list (11 elements) when there are more ZIPs
#    than pages;
#  * it passed offsets 0, 1, 2, ... to the API instead of multiples of
#    50, so consecutive "pages" overlapped by 49 records;
#  * the trailing `offset <- offset + 50` was dead code (offset was
#    reassigned at the top of every iteration).
for (i in seq_along(offset_list)) {
  # Yelp's `offset` is the number of records to skip; with limit = 50,
  # page i starts after (i - 1) * 50 records.
  offset <- (i - 1) * 50
  # zipnum tracks our position in zip_list for progress printing
  zipnum <- 1
  # Loop through each zip code
  for (zip_code in zip_list) {
    print(paste("batch ", i, ", ", "zip", zipnum, ": ", zip_code, sep = ""))
    # Fetch one page of Yelp data for this ZIP and store it in the list
    offset_list[[i]][[as.character(zip_code)]] <-
      get_yelp("historic landmarks and museums", as.character(zip_code), offset)
    zipnum <- zipnum + 1 # iterate zipnum each loop
  }
}
# Store response as dataframe ----
# Combine every page x ZIP dataframe into one sf point layer:
# deduplicate (the same business can appear in several ZIP queries),
# drop rows with no coordinates, then project from WGS84 lon/lat to
# PA South State Plane (EPSG:2272).
biz.sf <- map_dfr(offset_list, ~ bind_rows(.x)) %>%
  unique() %>%
  filter(!is.na(lat)) %>%
  st_as_sf(crs = 4326, coords = c("lon", "lat")) %>%
  st_transform("EPSG:2272")
st_write(biz.sf, "historic_landmarks.geojson", driver = "geojson")Write boundary files
# Philadelphia city boundary from Census place polygons (42 = PA FIPS code)
phl <- places(state = 42, year = 2022) %>%
  filter(NAME == "Philadelphia")
# NOTE(review): GDAL's canonical driver name is "GeoJSON"; lookup appears
# to be case-insensitive, but confirm "geojson" resolves on this setup.
st_write(phl, "phl_bound.geojson", driver = "geojson")